������Ѷ
���ೣ������
-
����ũ���������ֻ����пͻ���������������584204
-
Windows�Ż���ʦ��������416902
-
90��Ů��(��Ƶ��������)��������366961
-
����ũ���������ֻ����пͻ���������������365699
-
�첥�ֻ�����������325855
1.������
��������������Ŀ��Ҫ���������������Ĵ���ӰƬ������У�����������������������ʺ��û����飬�о���һ�������ı��Զ�����(רҵ�㽲��У��,proofread)��������ʵ���˸ù��ܣ��ش˼�¼��
2.������
������������������У����������ָ�����벻�������ߴ�������ʱϵͳ��ʾ�����������������Ӿ�����word������ʱ���к�ɫ�»�����ʾ��ʵ�ָù���Ŀǰ��Ҫ������˼·��
(1) ���ڴ����ֵ��ķִʷ�����Ҫ�ǽ��������ĺ��ִ���һ���ܴ��ġ������ʵ䡱�еĴ�������ƥ�䣬���ڴʵ����ҵ���ƥ���ɹ����÷�������ʵ�֣��Ƚ������������ĺ��ִ�
����ij����ij�������������ʻ����ƣ�
(2) ����ͳ����Ϣ�ķִʷ������õ���N-Gram����ģ�ͣ���ʵ����N-1��Markov(�����Ʒ�)ģ�ͣ��ڴ˼���һ�¸�ģ�ͣ�

��ʽ��Byes��ʽ�������ַ���X1X2����Xm���ֵĸ�����ÿ���ֵ������ֵ���������֮����Ϊ�˼���������Xi�ij��ֽ���ǰ�������ŵ�N-1���ַ��йأ��������Ĺ�ʽ��Ϊ��

������N-1��Markov(�����Ʒ�)ģ�ͣ����������ʺ���һ����ֵ�Աȣ���С�ڸ���ֵ����ʾ���ַ���ƴд������
3.ʵ����
���ڱ�����Ŀ���Ե����뺺�ִ���������Ӱ�Ӿ������Լ����ն�����Ŀ�����֣����Ͽ��ķ�Χ�����ȶ�Щ��������������2-Gram����Ԫ����ģ�����ֵ��ִ������ϵķ�����
��˵��˼·��
�����Ͽ����зִʴ��� ��> ������Ԫ�������ָ���(�����Ͽ��������£��ô������ֵ�Ƶ�ʴ���) ��> �Դ������ĺ��ִ��ִʲ��ҳ����������ַ����͵ڶ��������ַ��� ��>
���������͵ڶ��������ַ��������Ͽ���ӰƬ����ƥ�� ��> ����ƥ������ʵƴд�����ظ������ַ���(�����ֵ�����Ҫ)
��ע���ִ�������ICTCLAS Java API
上代码：
������ChineseWordProofread
3.1 初始化分词包并对影片语料库进行分词处理
1 public ICTCLAS2011 initWordSegmentation(){
2
3 ICTCLAS2011 wordSeg = new ICTCLAS2011();
4 try{
5 String argu = "F:\\Java\\workspace\\wordProofread"; //set your project path
6 System.out.println("ICTCLAS_Init");
7 if (ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"),0) == false)
8 {
9 System.out.println("Init Fail!");
10 //return null;
11 }
12
13 /*
14 * ���ô��Ա�ע��
15 ID �������Լ�
16 1 ������һ����ע��
17 0 ������������ע��
18 2 ����������ע��
19 3 ����һ����ע��
20 */
21 wordSeg.ICTCLAS_SetPOSmap(2);
22
23 }catch (Exception ex){
24 System.out.println("words segmentation initialization failed");
25 System.exit(-1);
26 }
27 return wordSeg;
28 }
29
30 public boolean wordSegmentate(String argu1,String argu2){
31 boolean ictclasFileProcess = false;
32 try{
33 //�ļ��ִ�
34 ictclasFileProcess = wordSeg.ICTCLAS_FileProcess(argu1.getBytes("GB2312"), argu2.getBytes("GB2312"), 0);
35
36 //ICTCLAS2011.ICTCLAS_Exit();
37
38 }catch (Exception ex){
39 System.out.println("file process segmentation failed");
40 System.exit(-1);
41 }
42 return ictclasFileProcess;
43 }3.2 ��������(tokens)���ֵ�Ƶ��
1 public Map<String,Integer> calculateTokenCount(String afterWordSegFile){
2 Map<String,Integer> wordCountMap = new HashMap<String,Integer>();
3 File movieInfoFile = new File(afterWordSegFile);
4 BufferedReader movieBR = null;
5 try {
6 movieBR = new BufferedReader(new FileReader(movieInfoFile));
7 } catch (FileNotFoundException e) {
8 System.out.println("movie_result.txt file not found");
9 e.printStackTrace();
10 }
11
12 String wordsline = null;
13 try {
14 while ((wordsline=movieBR.readLine()) != null){
15 String[] words = wordsline.trim().split(" ");
16 for (int i=0;i<words.length;i++){
17 int wordCount = wordCountMap.get(words[i])==null ? 0:wordCountMap.get(words[i]);
18 wordCountMap.put(words[i], wordCount+1);
19 totalTokensCount += 1;
20
21 if (words.length > 1 && i < words.length-1){
22 StringBuffer wordStrBuf = new StringBuffer();
23 wordStrBuf.append(words[i]).append(words[i+1]);
24 int wordStrCount = wordCountMap.get(wordStrBuf.toString())==null ? 0:wordCountMap.get(wordStrBuf.toString());
25 wordCountMap.put(wordStrBuf.toString(), wordStrCount+1);
26 totalTokensCount += 1;
27 }
28
29 }
30 }
31 } catch (IOException e) {
32 System.out.println("read movie_result.txt file failed");
33 e.printStackTrace();
34 }
35
36 return wordCountMap;
37 }3.3 �ҳ��������ַ����е���ȷtokens
1 public Map<String,Integer> calculateTokenCount(String afterWordSegFile){
2 Map<String,Integer> wordCountMap = new HashMap<String,Integer>();
3 File movieInfoFile = new File(afterWordSegFile);
4 BufferedReader movieBR = null;
5 try {
6 movieBR = new BufferedReader(new FileReader(movieInfoFile));
7 } catch (FileNotFoundException e) {
8 System.out.println("movie_result.txt file not found");
9 e.printStackTrace();
10 }
11
12 String wordsline = null;
13 try {
14 while ((wordsline=movieBR.readLine()) != null){
15 String[] words = wordsline.trim().split(" ");
16 for (int i=0;i<words.length;i++){
17 int wordCount = wordCountMap.get(words[i])==null ? 0:wordCountMap.get(words[i]);
18 wordCountMap.put(words[i], wordCount+1);
19 totalTokensCount += 1;
20
21 if (words.length > 1 && i < words.length-1){
22 StringBuffer wordStrBuf = new StringBuffer();
23 wordStrBuf.append(words[i]).append(words[i+1]);
24 int wordStrCount = wordCountMap.get(wordStrBuf.toString())==null ? 0:wordCountMap.get(wordStrBuf.toString());
25 wordCountMap.put(wordStrBuf.toString(), wordStrCount+1);
26 totalTokensCount += 1;
27 }
28
29 }
30 }
31 } catch (IOException e) {
32 System.out.println("read movie_result.txt file failed");
33 e.printStackTrace();
34 }
35
36 return wordCountMap;
37 }
3.4 得到最大连续和第二大连续字符串(也可能为单个字符)
/**
 * Picks two tokens out of the list returned by {@code getCorrectTokens(sInputResult)}:
 * a primary one (index 0 of the result) and a secondary one (index 1).
 *
 * <p>Selection is driven by token length first and by the score returned by
 * {@code probBetweenTowTokens} (defined elsewhere) for ties. When both picks end up
 * with the same length, tokens contained in them are used to shorten one of the
 * picks before the result is assembled. Intermediate values are printed to
 * standard output.
 *
 * @param sInputResult the pieces of the user input, forwarded unchanged to {@code getCorrectTokens}
 * @return {@code null} when {@code getCorrectTokens} yields no token; otherwise a
 *         two-element array {@code {maxSequence, maxSequence2}} (both entries are the
 *         same token when only one token exists; the second entry may be an empty string)
 */
1 public String[] getMaxAndSecondMaxSequnce(String[] sInputResult){
2 List<String> correctTokens = getCorrectTokens(sInputResult);
3 //TODO
4 System.out.println(correctTokens);
5 String[] maxAndSecondMaxSeq = new String[2];
// No token at all: callers must be prepared for a null result.
6 if (correctTokens.size() == 0) return null;
7 else if (correctTokens.size() == 1){
8 maxAndSecondMaxSeq[0]=correctTokens.get(0);
9 maxAndSecondMaxSeq[1]=correctTokens.get(0);
10 return maxAndSecondMaxSeq;
11 }
12
// Start with the first token as primary and the last token as secondary candidate.
13 String maxSequence = correctTokens.get(0);
14 String maxSequence2 = correctTokens.get(correctTokens.size()-1);
15 String littleword = "";
16 for (int i=1;i<correctTokens.size();i++){
17 //System.out.println(correctTokens);
// A strictly longer token replaces the primary pick; the previous primary is not
// moved into the secondary slot.
18 if (correctTokens.get(i).length() > maxSequence.length()){
19 maxSequence = correctTokens.get(i);
20 } else if (correctTokens.get(i).length() == maxSequence.length()){
21
22 //select the word with greater probability for single-word
23 if (correctTokens.get(i).length()==1){
24 if (probBetweenTowTokens(correctTokens.get(i)) > probBetweenTowTokens(maxSequence)) {
25 maxSequence2 = correctTokens.get(i);
26 }
27 }
28 //select words with smaller probability for multi-word, because the smaller has more self information
29 else if (correctTokens.get(i).length()>1){
30 if (probBetweenTowTokens(correctTokens.get(i)) <= probBetweenTowTokens(maxSequence)) {
31 maxSequence2 = correctTokens.get(i);
32 }
33 }
34
35 } else if (correctTokens.get(i).length() > maxSequence2.length()){
36 maxSequence2 = correctTokens.get(i);
37 } else if (correctTokens.get(i).length() == maxSequence2.length()){
38 if (probBetweenTowTokens(correctTokens.get(i)) > probBetweenTowTokens(maxSequence2)){
39 maxSequence2 = correctTokens.get(i);
40 }
41 }
42 }
43 //TODO
44 System.out.println(maxSequence+" : "+maxSequence2);
45 //delete the sub-word from a string
46 if (maxSequence2.length() == maxSequence.length()){
47 int maxseqvaluableTokens = maxSequence.length();
48 int maxseq2valuableTokens = maxSequence2.length();
// Despite the "min_" prefix, the ">=" comparisons below keep the LARGEST score seen
// among the tokens contained in each pick (the last one wins on ties).
49 float min_truncate_prob_a = 0 ;
50 float min_truncate_prob_b = 0;
51 String aword = "";
52 String bword = "";
53 for (int i=0;i<correctTokens.size();i++){
54 float tokenprob = probBetweenTowTokens(correctTokens.get(i));
// A token contained in both picks is only considered for maxSequence (else-if below).
55 if ((!maxSequence.equals(correctTokens.get(i))) && maxSequence.contains(correctTokens.get(i))){
56 if ( tokenprob >= min_truncate_prob_a){
57 min_truncate_prob_a = tokenprob ;
58 aword = correctTokens.get(i);
59 }
60 }
61 else if ((!maxSequence2.equals(correctTokens.get(i))) && maxSequence2.contains(correctTokens.get(i))){
62 if (tokenprob >= min_truncate_prob_b){
63 min_truncate_prob_b = tokenprob;
64 bword = correctTokens.get(i);
65 }
66 }
67 }
68 //TODO
69 System.out.println(aword+" VS "+bword);
70 System.out.println(min_truncate_prob_a+" VS "+min_truncate_prob_b);
// Shorten the pick whose contained token has the lower score: strip aword from
// maxSequence, otherwise strip bword from maxSequence2 (unless the stripped rest is
// already part of maxSequence, in which case maxSequence2 is kept whole).
71 if (aword.length()>0 && min_truncate_prob_a < min_truncate_prob_b){
72 maxseqvaluableTokens -= 1 ;
73 littleword = maxSequence.replace(aword,"");
74 }else {
75 maxseq2valuableTokens -= 1 ;
76 String temp = maxSequence2;
77 if (maxSequence.contains(temp.replace(bword, ""))){
78 littleword = maxSequence2;
79 }
80 else littleword = maxSequence2.replace(bword,"");
81
82 }
83
// Both counters started equal, so this is true exactly when maxSequence was the one
// shortened above: maxSequence2 is promoted to primary and the shortened text
// becomes the secondary pick.
84 if (maxseqvaluableTokens < maxseq2valuableTokens){
85 maxSequence = maxSequence2;
86 maxSequence2 = littleword;
87 }else {
88 maxSequence2 = littleword;
89 }
90
91 }
92 maxAndSecondMaxSeq[0] = maxSequence;
93 maxAndSecondMaxSeq[1] = maxSequence2;
94
95 return maxAndSecondMaxSeq ;
96 }3.5 ���ظ����б�
1 public List<String> proofreadAndSuggest(String sInput){
2 //List<String> correctTokens = new ArrayList<String>();
3 List<String> correctedList = new ArrayList<String>();
4 List<String> crtTempList = new ArrayList<String>();
5
6 //TODO
7 Calendar startProcess = Calendar.getInstance();
8 char[] str2char = sInput.toCharArray();
9 String[] sInputResult = new String[str2char.length];//cwp.wordSegmentate(sInput);
10 for (int t=0;t<str2char.length;t++){
11 sInputResult[t] = String.valueOf(str2char[t]);
12 }
13 //String[] sInputResult = cwp.wordSegmentate(sInput);
14 //System.out.println(sInputResult);
15 //float re = probBetweenTowTokens("��","��");
16 String[] MaxAndSecondMaxSequnce = getMaxAndSecondMaxSequnce(sInputResult);
17
18 // display errors and suggest correct movie name
19 //System.out.println("hasError="+hasError);
20 if (hasError !=0){
21 if (MaxAndSecondMaxSequnce.length>1){
22 String maxSequence = MaxAndSecondMaxSequnce[0];
23 String maxSequence2 = MaxAndSecondMaxSequnce[1];
24 for (int j=0;j<movieName.size();j++){
25 //boolean isThisMovie = false;
26 String movie = movieName.get(j);
27
28
29 //System.out.println("maxseq is "+maxSequence+", maxseq2 is "+maxSequence2);
30
31 //select movie
32 if (maxSequence2.equals("")){
33 if (movie.contains(maxSequence)) correctedList.add(movie);
34 }
35 else {
36 if (movie.contains(maxSequence) && movie.contains(maxSequence2)){
37 //correctedList.clear();
38 crtTempList.add(movie);
39 //correctedList.add(movie);
40 //break;
41 }
42 //else if (movie.contains(maxSequence) || movie.contains(maxSequence2)) correctedList.add(movie);
43 else if (movie.contains(maxSequence)) correctedList.add(movie);
44 }
45
46 }
47
48 if (crtTempList.size()>0){
49 correctedList.clear();
50 correctedList.addAll(crtTempList);
51 }
52
53 //TODO��
54 if (hasError ==1) System.out.println("No spellig error,Sorry for having no this movie,do you want to get :"+correctedList.toString()+" ?");
55 //TODO
56 else System.out.println("Spellig error,do you want to get :"+correctedList.toString()+" ?");
57 } //TODO
58 else System.out.println("there are spellig errors, no anyone correct token in your spelled words,so I can't guess what you want, please check it again");
59
60 } //TODO
61 else System.out.println("No spelling error");
62
63 //TODO
64 Calendar endProcess = Calendar.getInstance();
65 long elapsetime = (endProcess.getTimeInMillis()-startProcess.getTimeInMillis()) ;
66 System.out.println("process work elapsed "+elapsetime+" ms");
67 ICTCLAS2011.ICTCLAS_Exit();
68
69 return correctedList ;
70 }3.6 ��ʾУ�Խ���
1 public static void main(String[] args) {
2
3 String argu1 = "movie.txt"; //movies name file
4 String argu2 = "movie_result.txt"; //words after segmenting name of all movies
5
6 SimpleDateFormat sdf=new SimpleDateFormat("HH:mm:ss");
7 String startInitTime = sdf.format(new java.util.Date());
8 System.out.println(startInitTime+" ---start initializing work---");
9 ChineseWordProofread cwp = new ChineseWordProofread(argu1,argu2);
10
11 String endInitTime = sdf.format(new java.util.Date());
12 System.out.println(endInitTime+" ---end initializing work---");
13
14 Scanner scanner = new Scanner(System.in);
15 while(true){
16 System.out.print("������ӰƬ����");
17
18 String input = scanner.next();
19
20 if (input.equals("EXIT")) break;
21
22 cwp.proofreadAndSuggest(input);
23
24 }
25 scanner.close();
26 }���ҵĻ�����ʵ���������£�

����Ҫ˵�������õ����Ͽ�û����̫�ദ�������������������кܶ���ȷ�Ľ����������dz����Ż��С��dz�����ʮ���ºϼ����ȣ���ЩֻҪ��ӰƬ���Ͽ��ϴ����¼��ɣ�
���о��Ǹ�ģ�Ͳ��ʺϴ���ģ�������ݣ�����˵���������е��Զ�У�����߽�������ʾ����ʹ��Ӱ�Ӿ硢���������յ�ӰƬ���Զ��������������ϱ�ģ�ͻ��кܶ������ĵط�����������ϧ���̣������������뷨������֪���������ǿ�Դ�����š����ģ�����Դ����github�ϣ������Լ�����ZIP���غ���ѹ����eclipse�д�������wordproofread������ѹ�����������ļ�copy���ù����£��������С�

DNFÿ��ǩ���ͺ�
lol7�������̵�
LOL�����Ҷ���ô
LOLӢ�۳ɾͱ�־
�����г�����ȷ
��У��ѧ������
������Ȧ��ô