数据结构代码相似度检测
思路:首先,对要进行比较的所有代码段进行词法分析,并转化为特定的标记(token)串,自己制定一个转换规则。接着,通过两两比较标记(token)串来确定代码之间的相似性,并由此确定代码之间抄袭的程度,过程如图2.1所示。
图2.1 源代码复制检测过程示意图
将这两个代码分别转换为token串后,基于算法
RKR-GST( running-karp-rabin greedy-string-tiling)算法思想,循环求取两个标记串中未被匹配部分的最大公共子串,将其用空格代替,并根据如下公式求出两个token串A,B的相似度:
(2.3)
过程论述
流程图
首先要对整个设计做一个整体规划,即设计一个流程图,如图3.1。
图3.1 程序流程图
程序清单
#include #include #include #include #include #include #include #define N 10000 #define M 10000
#define MAXSTRLEN 10000 //定义最大串长
typedef int status;
typedef unsigned char SString[MAXSTRLEN+1]; //串的定长顺序存储表示 SString a[3]={"int","long","short"}; SString b[2]={"float","double"}; SString
c[15]={"&&","||","++","--","+","-","*","/","=",">=","","
SString d[12]={"[","]","{","}","(",")",",",";","'","#",";","."}; SString
e[29]={"auto","break","case","char","const","continue","default","do","else","enum",
"extern","for","goto","if","main","printf","register","return","signed","sizeof",
"static","struct","switch","typedef","union","unsigned","void","while","volatile"};
HANDLE hOut; DWORD written;
void ShadowWindowLine(char *str); char type(char *str);
void token(char name[],char list[],char token[],FILE *table); void simple(int MinMatchLen,FILE *fp1,FILE *fp2); status replace(SString s,int pos,int len,int Ls); int copy(float n);
void ShadowWindowLine(char *str) {
SMALL_RECT rc;
CONSOLE_SCREEN_BUFFER_INFO bInfo; // 窗口缓冲区信息 WORD att0,att1,attBack; int i, chNum = strlen(str);
GetConsoleScreenBufferInfo( hOut, &bInfo ); // 获取窗口缓冲区信息
// 计算显示窗口大小和位置
rc.Left = (bInfo.dwSize.X - chNum)/2 - 2;
rc.Top = 8; // 原代码段中此处为bInfo.dwSize.Y/2 - 2,但是如果您的DOS屏幕有垂直滚动条的话,还需要把滚动条下拉才能看到,为了方便就把它改为10
rc.Right = rc.Left + chNum + 4; rc.Bottom = rc.Top + 4;
att0 = BACKGROUND_RED |BACKGROUND_BLUE; // 阴影属性
att1 = FOREGROUND_RED |FOREGROUND_GREEN |FOREGROUND_BLUE | FOREGROUND_INTENSITY
|
BACKGROUND_RED
|
BACKGROUND_BLUE
|
BACKGROUND_INTENSITY;// 文本属性
attBack = BACKGROUND_RED |BACKGROUND_GREEN |BACKGROUND_BLUE | BACKGROUND_INTENSITY; // 背景属性 // 设置阴影然后填充
COORD posShadow = {rc.Left+1, rc.Top+1}, posText = {rc.Left, rc.Top},posBack={0,0}; for (i=0;i
FillConsoleOutputAttribute(hOut, &written);
posBack.Y++; }
for (i=0; i
FillConsoleOutputAttribute(hOut, att0, chNum + 4, posShadow, &written);
posShadow.Y++; }
for (i=0;i
FillConsoleOutputAttribute(hOut, att1,chNum + 4, posText, &written);
posText.Y++; }
// 写文本和边框
posText.X = rc.Left + 2;
attBack,80, posBack,
posText.Y = rc.Top + 2;
WriteConsoleOutputCharacter(hOut, str, strlen(str), posText, &written);
SetConsoleTextAttribute(hOut, bInfo.wAttributes); // 恢复原来的属性 }
char type(char *str) //{
int i;
for(i=0;i
if(strcmp(str,a[i])==0) return 'K'; }
for(i=0;i
if(strcmp(str,b[i])==0) return 'E'; }
for(i=0;i
if(strcmp(str,c[i])==0) return 'A'; }
for(i=0;i
if(strcmp(str,d[i])==0) return 'R'; }
for(i=0;i
if(strcmp(str,e[i])==0) return 'Y'; }
此函数判断单词类型 中的关键字 中的关键字 中的符号 中符号 中的关键字
if(isdigit(str[0])) //0-9是数字 {
return 'N'; }
//一般的变量与字符 if(!isalnum(str[0])) return 'H'; else return 'C';//变量 }
void token(char name[],char list[],char token[],FILE *table) //个文件中的字符串分别切割转换为token串 {
FILE *in,*out;
char ch,c,buffer[N],*link[M]; int i=0,j=0,k=0,LenLink=0; if((in=fopen(name,"r+"))==NULL) {
printf("源文件无法打开!\n"); exit(0); }
if((out=fopen(list,"w+"))==NULL) {
printf("文件写入失败!\n"); exit(0); }
if((table=fopen(token,"w+"))==NULL) {
printf("文件写入失败!\n"); exit(0); }
while(!feof(in)) //逐字读取文件 {
将两
ch=fgetc(in);
if(ch=='\t' || ch==' ' || ch== '\n') //去掉空格、制表符、回车 continue;
if(isalpha(ch)) //如果首字符是字母 {
while(isalnum(ch)&&(i
buffer[i++]=ch; ch=fgetc(in); }
buffer[i]='\0'; link[j++]=(char
*)malloc(sizeof(char)*(strlen(buffer)+1)); strcpy(link[j-1],buffer); i=0;
fseek(in,-1L,1); //在文件当中定位 }
else if(isdigit(ch)) //如果首字符是数字 {
while(isalnum(ch)&&(i
buffer[i++]=ch; ch=fgetc(in); }
buffer[i]='\0'; link[j++]=(char
*)malloc(sizeof(char)*(strlen(buffer)+1)); strcpy(link[j-1],buffer); i=0;
fseek(in,-1L,1); }
else if(!isalnum(ch)) //如果首字符既不是数字也不是字母 {
if(ch!='\n'&&ch!=' '&&ch!='\t') {
if(ch=='>'||ch=='
if((c=fgetc(in))=='=') //>=,
{
buffer[i++]=ch; buffer[i++]=c; buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*3); strcpy(link[j-1],buffer); i=0; } else {
buffer[i++]=ch; buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*2); strcpy(link[j-1],buffer); i=0;
fseek(in,-1L,1); } }
else if(ch=='+'||ch=='-'||ch=='&'||ch=='|'||ch=='=') {
if((c=fgetc(in))==ch) //++,--,&&,||,==这些需被认为是一个符号 {
buffer[i++]=ch; buffer[i++]=c; buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*3); strcpy(link[j-1],buffer); i=0; } else
{
buffer[i++]=ch; buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*2); strcpy(link[j-1],buffer); i=0;
fseek(in,-1L,1); } }
else //其他符号 {
buffer[i++]=ch; buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*2); strcpy(link[j-1],buffer); i=0; } } } }
LenLink = j-1; //存到link中的总长度
for(i=0;i
c=type(link[i]); //
if(c=='N'||c=='A'||c=='R')//数字,符号在表中保留 fputs(link[i],table); if(c=='C') //变量均替换为id fputs("id",table);
if(c=='K')//关键字int,short,long替换为zh fputs("zh",table);
if(c=='E')//关键字float,double替换为fu fputs("fu",table); if(c=='Y')//其他关键字不变 fputs(link[i],table);
if(c=='H')//汉字删掉 fputs("\0",table); }
fclose(table);
fprintf(out,"\t***** 单词类型观察表 *****\n");//打印list中的内容
fprintf(out,"\t K --int,short,long \n"); fprintf(out,"\t E --float,double\n"); fprintf(out,"\t Y --其他关键字\n"); fprintf(out,"\t A --运算符号\n"); fprintf(out,"\t R --语言符号\n"); fprintf(out,"\t N --数字\n"); fprintf(out,"\t H --汉字\n");
fprintf(out,"\t C --一般变量或标识符\n"); fprintf(out,"\t*****************************\n");
for(i=0;i
c=type(link[i]); //判断单词的类型 fputc('(',out); fputc(c,out); fputc(',',out); fputs(link[i],out); fputc(',',out); fprintf(out,"%d",i); fputc(')',out); fputc('\n',out); } }
void simple(int MinMatchLen,FILE *fp1,FILE *fp2)//此函数计算相似度,MinMatchLen: 公共子串要达到的最小长度 {
SString A,B;
int i=0,j=0,k,t,s,a=1,La,Lb,lena,lenb,x,y;
float n;
int MatchLen=0;//所有公共子串的总长度
int maxmatch;//当前最大公共子串长度
if ((fp1=fopen("f:\\token1.txt","r"))==NULL)//设定文件位于当前目录下,可更改为绝对路径
{
printf("文件打开失败!");
getch();
exit(0);
}
A[++i]=fgetc(fp1);
while(!feof(fp1))
A[++i]=fgetc(fp1);
fclose(fp1);
La=i-1;
printf("token串1长度为%d,",La);
if ((fp2=fopen("f:\\token2.txt","r"))==NULL)//设定文件位于当前目录下,可更改为绝对路径
{
printf("文件打开失败!");
getch();
exit(0);
}
B[++j]=fgetc(fp2);
while(!feof(fp2))
B[++j]=fgetc(fp2);
fclose(fp2);
Lb=j-1;
printf("token串2长度为%d\n",Lb);
printf("是否要查看这两个token串?Y/N ");
if(h=='Y'||h=='y')
{
ShellExecute(NULL,"open","F:\\token1.txt",NULL,NULL,SW_SHOWNORMAL);
ShellExecute(NULL,"open","F:\\token2.txt",NULL,NULL,SW_SHOWNORMAL); }
getchar();
printf("\n将超过指定长度的公共子串用空格替换,是否要查看细节?Y/N ");
ch=getchar();
lena=i-1;
lenb=j-1;
do
{
maxmatch=MinMatchLen;
for(i=1;i
{
for(j=1;j
{
k=0;
while((k
//串A的第i+k个字符与串B的第j+k个字符是否相等
k++;
if(k>maxmatch)
{
maxmatch=k;
x=i;
y=j;
}
}
}
if(maxmatch>MinMatchLen)
{
replace(A,x,maxmatch,La);
replace(B,y,maxmatch,Lb);
La=La-maxmatch+1;
Lb=Lb-maxmatch+1;
MatchLen+=maxmatch;
}
if(ch=='Y'||ch=='y')
{
printf("第%d次检查两串中的匹配串\n",a);
a++;
for(s=1;s
printf("%c",A[s]);
printf("\n");
for(s=1;s
printf("%c",B[s]);
printf("\n");
}
}
while(maxmatch>MinMatchLen);
printf("\n已经没有能够匹配的公共子串了\n");
n=(2.0*MatchLen)/(lena+lenb);
printf("公共子串的总长为%d,",MatchLen);
printf("根据公式\n");
printf("\t\t
——————————————————————————\n");
printf("\t\t| 相似度=(2×公共子串长度)÷(串A长度+串B长度) |\n");
printf("\t\t
——————————————————————————\n");
printf("这两串代码的相似度为%f\n",n);
copy(n);
}
status replace(SString s,int pos,int len,int Ls) //用空格来代替两个token串中的最大匹配子串
{
int i;
if(posLs-len+1||len
return 0;
s[pos]=' ';
for(i=pos+len;i
{
s[i-len+1]=s[i];
}
return 1;
}
int copy(float n) //此函数判断是否抄袭
{
printf("\n相似度超过0.8,则认为是抄袭");
if(n>=0.8)
printf("\n这两个代码有抄袭嫌疑,请做进一步检查");
else
printf("\n这两个代码没有抄袭嫌疑");
return 0;
}
void main(void)
{
hOut = GetStdHandle(STD_OUTPUT_HANDLE); // 获取标准输出设备句柄 SetConsoleOutputCP(936); // 设置代码页,此为中文简体
ShadowWindowLine(" 欢迎使用C语言代码复制/相似度检测软件 "); getchar();
system("cls"); //清屏
char name1[50];
char name2[50]; //存储输入的文件路径字符串
FILE *f1,*f2;
system("color F3");
printf("\n代码1:");
scanf("%s",name1);
token(name1,"f:\\list1.txt","f:\\token1.txt",f1); printf("代码2:");
scanf("%s",name2);
token(name2,"f:\\list2.txt","f:\\token2.txt",f2); printf("\ntoken串已生成成功,");
getchar();
simple(3,f1,f2);
}