0% found this document useful (0 votes)
245 views6 pages

Conflation

The document describes a conflation algorithm to process text documents. The algorithm performs the following steps: 1. Displays the original file contents. 2. Removes punctuation marks from the text. 3. Removes high frequency words from the text. 4. Performs suffix stripping to remove common suffixes from words. 5. Detects equivalent stems remaining after suffix stripping. 6. Generates a representation of the original text after processing. The algorithm opens multiple files to read input text and write output at each processing step. It provides menus to demonstrate each step and writes the results to files.

Uploaded by

Pratik B
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
245 views6 pages

Conflation

The document describes a conflation algorithm to process text documents. The algorithm performs the following steps: 1. Displays the original file contents. 2. Removes punctuation marks from the text. 3. Removes high frequency words from the text. 4. Performs suffix stripping to remove common suffixes from words. 5. Detects equivalent stems remaining after suffix stripping. 6. Generates a representation of the original text after processing. The algorithm opens multiple files to read input text and write output at each processing step. It provides menus to demonstrate each step and writes the results to files.

Uploaded by

Pratik B
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 6

Aim: Conflation Algorithm

#include<stdio.h>
#include<conio.h>
#include<stdlib.h>
#include<string.h>
/*
 * Forward declarations for the processing steps defined later in this file.
 * The parameter lists are left empty, so the compiler does not check the
 * arguments passed at the call sites against the definitions.
 */
void orig_file();          /* print a stream's contents to the screen */
void punct_remove();       /* replace punctuation marks with spaces */
void freq_words_remove();  /* drop words found in the stop-word list */
void suffix_strip();       /* strip listed suffixes from words */
void equi_stem();          /* remove repeated words */
void stem();               /* replace words containing "happiness" with "happy" */
/*
 * Menu-driven driver for the conflation demonstration.
 *
 * Opens the input files (Filenew.txt, Stops.txt, Suffix_list.txt) and the
 * intermediate/output files (Wo_punct.txt, Wo_freq.txt, Wo_suffix.txt,
 * Doc_copy.txt, Doc_rep.txt), then repeatedly shows a menu and runs the
 * selected step until the user chooses Exit or declines to continue.
 *
 * Returns 0 on normal termination; exits with EXIT_FAILURE if any file
 * cannot be opened.
 */
int main(void)
{
    int ans = 0, ch = 0;
    int quit = 0;
    FILE *fp, *fp1, *fp2, *fp3, *fp4, *fp5, *fp6, *fp7;

    clrscr();
    fp = fopen("Filenew.txt", "r+");
    fp1 = fopen("Wo_punct.txt", "w+");
    fp2 = fopen("Stops.txt", "r+");
    fp3 = fopen("Wo_freq.txt", "w+");
    fp4 = fopen("Suffix_list.txt", "r+");
    fp5 = fopen("Wo_suffix.txt", "w+");
    fp6 = fopen("Doc_copy.txt", "w+");
    fp7 = fopen("Doc_rep.txt", "w+");
    if (fp == NULL || fp1 == NULL || fp2 == NULL || fp3 == NULL ||
        fp4 == NULL || fp5 == NULL || fp6 == NULL || fp7 == NULL) {
        printf("\nError in opening file!!!\n");
        getch();
        exit(EXIT_FAILURE); /* report the failure to the caller */
    }

    printf("\t\t --------CONFLATION ALGORTIHM SIMULATION--------\n");
    do {
        printf("\nMENU:\n=====\n");
        printf("\n1.Display the Original File contents.");
        printf("\n2.Remove Punctuation Marks.");
        printf("\n3.Remove the High Frequency Words.");
        printf("\n4.Perform Suffix Stripping.");
        printf("\n5.Detect Equivalent Stems.");
        printf("\n6.Document Representation of the original file.");
        printf("\n7.Exit.");
        printf("\nEnter your Choice: ");

        if (scanf("%d", &ch) != 1) {
            int c;

            /* Discard the unparsable line so the next scanf does not
               fail on the same characters forever. */
            while ((c = getchar()) != '\n' && c != EOF) {
            }
            if (c == EOF) {
                break; /* no more input: leave the menu loop */
            }
            ch = 0; /* handled by the default branch as invalid input */
        }

        switch (ch) {
        case 1:
            printf("\n\t\t\t\tORIGINAL DOCUMENT\n");
            printf("\t\t\t\t-----------------\n");
            orig_file(fp);
            break;
        case 2:
            printf("\n\t\t\tAfter removal of Punctuation Marks\n");
            printf("\t\t\t----------------------------------\n");
            punct_remove(fp, fp1);
            break;
        case 3:
            printf("\n\t\t\tAfter Removal of High Frequency Words\n");
            printf("\t\t\t-------------------------------------\n");
            freq_words_remove(fp1, fp2, fp3);
            break;
        case 4:
            printf("\n\t\t\tAfter Suffix Stripping\n");
            printf("\t\t\t----------------------\n");
            suffix_strip(fp3, fp4, fp5);
            break;
        case 5:
            printf("\n\t\t\tAfter Detecting Equivalent Stems\n");
            printf("\t\t\t--------------------------------\n");
            /* equi_stem is defined with two parameters */
            equi_stem(fp5, fp6);
            stem(fp6, fp7);
            break;
        case 6:
            printf("\n\t\t\t DOCUMENT REPRESENTATION\n");
            printf("\t\t\t -----------------------\n");
            orig_file(fp7);
            break;
        case 7:
            quit = 1;
            break;
        default:
            printf("\nINVALID INPUT!!!");
            break;
        }
        if (quit) {
            break;
        }

        printf("\n\nDo You Want To Continue?\n1.YES\t\t0.NO\n");
        if (scanf("%d", &ans) != 1) {
            ans = 0; /* treat unreadable input as "no" */
        }
    } while (ans == 1);

    fclose(fp);
    fclose(fp1);
    fclose(fp2);
    fclose(fp3);
    fclose(fp4);
    fclose(fp5);
    fclose(fp6);
    fclose(fp7);
    if (!quit) {
        getch(); /* the Exit menu choice leaves immediately, without this pause */
    }
    return 0;
}
/*
 * Print the full contents of a stream to standard output.
 *
 * fp: stream open for reading. It is rewound first, so the output always
 *     starts at the beginning of the file. A NULL stream is ignored.
 *
 * Reading continues until fgetc() reports EOF.
 */
void orig_file(FILE *fp)
{
    /* int, not char: fgetc() returns EOF as a value distinct from every
       byte, so a 0xFF byte in the file must not end the loop. */
    int ch;

    if (fp == NULL) {
        return;
    }

    rewind(fp);
    while ((ch = fgetc(fp)) != EOF) {
        putchar(ch);
    }
}
/*
 * Copy fp to fp1 with every punctuation mark replaced by a single space,
 * then display the resulting file.
 *
 * fp:  input stream (rewound before reading).
 * fp1: output stream (rewound before writing); must also be readable,
 *      because the result is printed through orig_file().
 *
 * The marks replaced are: . , ! ? : ' " ; - ( ) [ ]
 */
void punct_remove(FILE *fp, FILE *fp1)
{
    static const char marks[] = ".,!?:'\";-()[]";
    int ch; /* int so that EOF is distinguishable from every byte value */

    rewind(fp);
    rewind(fp1);
    /* Test the read itself rather than feof() beforehand, so the EOF
       marker is never written to the output as an extra character. */
    while ((ch = getc(fp)) != EOF) {
        /* the '\0' guard keeps strchr() from matching the terminator */
        if (ch != '\0' && strchr(marks, ch) != NULL) {
            fputc(' ', fp1);
        } else {
            fputc(ch, fp1);
        }
    }
    orig_file(fp1);
}
/*
 * Copy the words of fp1 to fp3, omitting every word that appears in the
 * stop-word list fp2 (compared case-insensitively), then display fp3.
 *
 * fp1: stream holding the whitespace-separated input words; it is rewound
 *      and read directly (no second handle on the file is opened).
 * fp2: stream holding the stop words; rescanned from the start per word.
 * fp3: output stream (rewound before writing); each kept word is followed
 *      by one space.
 *
 * Words longer than 63 characters are read in 63-character pieces.
 */
void freq_words_remove(FILE *fp1, FILE *fp2, FILE *fp3)
{
    char dword[64];
    char stopword[64];

    rewind(fp1);
    rewind(fp3);
    /* Loop on the fscanf result so the final word is processed even when
       the file does not end in whitespace. */
    while (fscanf(fp1, "%63s", dword) == 1) {
        int is_stop = 0;

        rewind(fp2);
        /* Scan the whole list instead of assuming a fixed entry count. */
        while (fscanf(fp2, "%63s", stopword) == 1) {
            if (stricmp(dword, stopword) == 0) {
                is_stop = 1;
                break;
            }
        }
        if (!is_stop) {
            fprintf(fp3, "%s ", dword);
        }
    }
    orig_file(fp3);
}
/*
 * Strip suffixes from the words of fp3 and write the result to fp5, then
 * display fp5.
 *
 * fp3: stream holding the whitespace-separated input words; it is rewound
 *      and read directly (no second handle on the file is opened).
 * fp4: suffix list; at most its first eight entries are consulted, and the
 *      position of an entry in the list selects the rule applied to it.
 * fp5: output stream (rewound before writing). Exactly one token followed
 *      by a space is written per input word: the stripped form when a rule
 *      applies, otherwise the word unchanged.
 *
 * Rules, applied to the first list entry that occurs anywhere in the word
 * (words shorter than four characters are never stripped):
 *   entries 1-6: strip when the text from the last occurrence of the
 *                suffix's first letter to the end of the word has the same
 *                length as the suffix;
 *   entry 7:     same test, but only when the match is the word's final
 *                character;
 *   entry 8:     same test, or the tail may be one character longer than
 *                the suffix, in which case an 'l' is appended to the stem.
 *
 * NOTE(review): only the first entry found as a substring is considered;
 * later entries are not tried for that word. The tail is compared with the
 * suffix by length only, not by content. Both are kept as in the original
 * algorithm - confirm against the intended Suffix_list.txt.
 */
void suffix_strip(FILE *fp3, FILE *fp4, FILE *fp5)
{
    char sword[64];
    char suffix[32];
    char newword[65]; /* stem of at most 63 chars, optional 'l', terminator */

    rewind(fp3);
    rewind(fp5);
    while (fscanf(fp3, "%63s", sword) == 1) {
        const char *out = sword; /* written unchanged unless a rule strips it */
        size_t length = strlen(sword);
        int i;

        rewind(fp4);
        for (i = 0; i < 8 && fscanf(fp4, "%31s", suffix) == 1; i++) {
            const char *match = strstr(sword, suffix);
            const char *tail;
            size_t suf_length, tail_length, keep;
            int strip = 0;
            int add_l = 0;

            if (match == NULL) {
                continue;
            }

            suf_length = strlen(suffix);
            /* non-NULL: the suffix, hence its first letter, occurs in sword */
            tail = strrchr(sword, suffix[0]);
            tail_length = strlen(tail);

            if (i <= 5) {
                printf("Suffix '%s' is a substring of '%s'\n\n", match, sword);
                strip = (tail_length == suf_length);
            } else if (i == 6 && strlen(match) == 1) {
                printf("Suffix '%s' is a substring of '%s'\n\n", match, sword);
                strip = (tail_length == suf_length);
            } else if (i == 7) {
                printf("Suffix '%s' is a substring of '%s'\n\n", match, sword);
                if (tail_length == suf_length) {
                    strip = 1;
                } else if (tail_length == suf_length + 1) {
                    strip = 1;
                    add_l = 1;
                }
            }

            if (strip && length >= 4) {
                /* Keep everything before the tail; copying only up to the
                   first occurrence of the letter would cut words such as
                   "singing" down to "s". */
                keep = (size_t)(tail - sword);
                memcpy(newword, sword, keep);
                if (add_l) {
                    newword[keep++] = 'l';
                }
                newword[keep] = '\0';
                out = newword;
            }
            break;
        }
        /* Always emit one token so that no word is silently dropped. */
        fprintf(fp5, "%s ", out);
    }
    orig_file(fp5);
}
/*
 * Copy the words of fp5 to fp6, keeping only the first occurrence of each
 * word (compared case-insensitively), then display fp6.
 *
 * fp5: stream holding the whitespace-separated input words (rewound first).
 * fp6: stream opened for both reading and writing; it is rescanned for
 *      every input word to find out whether that word was already written.
 *      Each kept word is followed by one space.
 *
 * Words longer than 63 characters are read in 63-character pieces.
 */
void equi_stem(FILE *fp5, FILE *fp6)
{
    char word1[64];
    char nextword[64];

    rewind(fp5);
    rewind(fp6);
    /* Loop on the fscanf result: empty input writes nothing, and a failed
       read never re-processes the previous word. */
    while (fscanf(fp5, "%63s", word1) == 1) {
        int seen = 0;

        rewind(fp6);
        while (fscanf(fp6, "%63s", nextword) == 1) {
            if (stricmp(word1, nextword) == 0) {
                seen = 1;
                break;
            }
        }
        if (!seen) {
            /* reposition explicitly before switching from reading to writing */
            fseek(fp6, 0L, SEEK_END);
            fprintf(fp6, "%s ", word1);
        }
    }
    printf("\n\t\t\tAfter Removing Repeated Words\n");
    printf("\t\t\t-----------------------------\n");
    orig_file(fp6);
}
/*
 * Copy the words of fp6 to fp7, replacing every word that contains
 * "happiness" with "happy", then display fp7.
 *
 * fp6: stream holding the whitespace-separated input words (rewound first).
 * fp7: output stream, rewound before writing like the other steps' outputs
 *      so that a repeated run rewrites the file instead of appending to it;
 *      must also be readable. Each word is followed by one space.
 *
 * Words longer than 63 characters are read in 63-character pieces.
 */
void stem(FILE *fp6, FILE *fp7)
{
    static const char rep_word[] = "happy";
    char word1[64];

    printf("\n\n\t\t\tAfter Removing Equivalent Stems\n");
    printf("\t\t\t-------------------------------\n");
    rewind(fp6);
    rewind(fp7);
    /* Loop on the fscanf result so the final word is processed even when
       the file does not end in whitespace. */
    while (fscanf(fp6, "%63s", word1) == 1) {
        if (strstr(word1, "happiness") != NULL) {
            fprintf(fp7, "%s ", rep_word);
        } else {
            fprintf(fp7, "%s ", word1);
        }
    }
    orig_file(fp7);
}

You might also like