利用page rank,hits算法实现的一个简单的文本摘要系统
/Files/rocketfan/Language_Independent_Extractive_Summarization.pdf
/Files/rocketfan/Summarization_readme.pdf
Well,当前仅仅做到将文本中的句子按重要程度排序,可进一步操作生成摘要,仅仅用于英文文本。
仅以该随笔纪念我的第一个JAVA程序:)
//usage
//javac TestSummarizer.java
//java TestSummarizer <inputFile> <outputFile> #output file 中将会对句子按重要程度排好序
1 /**
2 * The class for getting user command and
3 * parse the command to set up aprropritae
4 * Sumarizer who will finish the job of ranking the
5 * importance of all the input file sentences ,
6 * sort sentences based on that and then
7 * write to the output file.
8 */
9 public class TestSummarizer {
10
11 public static void main(String[] args) {
12
13 String rankMethod = args[0];
14 String inputFile = args[1];
15 String outputFile = args[2];
16 //TODO warning not supported method
17 Ranker ranker;
18 if (rankMethod.equals("-pagerank")) {
19 ranker = new PageRankRanker();
20 } else if (rankMethod.equals("-hits")) {
21 ranker = new HitsRanker();
22 } else {
23 System.out.println("Not -pagerank or -hits use -pagerank");
24 ranker = new PageRankRanker();
25 }
26 Summarizer summarizer = new Summarizer(inputFile, outputFile, ranker);
27 summarizer.fileSummarize();
28 }
29 } //end of class TestSumarizer
2 * The class for getting user command and
3 * parse the command to set up aprropritae
4 * Sumarizer who will finish the job of ranking the
5 * importance of all the input file sentences ,
6 * sort sentences based on that and then
7 * write to the output file.
8 */
9 public class TestSummarizer {
10
11 public static void main(String[] args) {
12
13 String rankMethod = args[0];
14 String inputFile = args[1];
15 String outputFile = args[2];
16 //TODO warning not supported method
17 Ranker ranker;
18 if (rankMethod.equals("-pagerank")) {
19 ranker = new PageRankRanker();
20 } else if (rankMethod.equals("-hits")) {
21 ranker = new HitsRanker();
22 } else {
23 System.out.println("Not -pagerank or -hits use -pagerank");
24 ranker = new PageRankRanker();
25 }
26 Summarizer summarizer = new Summarizer(inputFile, outputFile, ranker);
27 summarizer.fileSummarize();
28 }
29 } //end of class TestSumarizer
//Summarizer.java是核心部分,通过建立inverted word index 计算出所有句子对的overlap
1 //import java.util.regex.*;
2
3 import java.util.*;
4
5 import java.io.*;
6
7 import java.text.*;
8
9 //to do how to split also consider ?
10
11
12
13 /**
14
15 * Summarizer is on charge of caculating
16
17 * a Matrix A, with A[i][j] represents
18
19 * the overlap of sentence i and j.
20
21 * It will than send the Matrix A to a
22
23 * Ranker who will caculate all the sentence's
24
25 * importance value and send back to Summarizer.
26
27 * Summarizer will sort sentences based on
28
29 * their importance value and write the reult
30
31 * to file.
32
33 *
34
35 * Summarizer is also on charge of spliting doucument
36
37 * content to sentences and split each sentence to
38
39 * words.
40
41 */
42
43 public class Summarizer {
44
45
46
47 /**
48
49 * Define Sentence type, it will record the sentence's
50
51 * start and end postion in fileContentString.
52
53 * Also will record the word number in this sentence.
54
55 */
56
57 private class Sentence {
58
59 public int start;
60
61 public int end;
62
63 public int wordNum;
64
65 }
66
67
68
69 /** Store which rank aloritm to use*/
70
71 private final Ranker ranker;
72
73 /** Store the name of output file*/
74
75 private final String outputFileName;
76
77 /** The whole file content is stored as a string here*/
78
79 private final String fileContentString;
80
81 /** sotre all the sentences info*/
82
83 private final List<Sentence> sentenceList = new ArrayList<Sentence> ();;
84
85 /** the dfault stop words we will use*/
86
87 private final static String stopWords[] =
88
89 {"a", "about", "an", "and", "are", "as", "at", "be", "but", "by",
90
91 "for", "has", "have", "he", "his", "her", "in", "is", "it", "its",
92
93 "of", "on", "or", "she", "that", "the", "their", "they", "this",
94
95 "to", "was", "which", "who", "will", "with", "you"};
96
97 private final Map<String, String> hmStopWord = new HashMap<String, String> ();
98
99
100
101 /**
102
103 * Construct of a summarizer
104
105 * Client must provide the inputFileName to sumarize
106
107 * and the aimed outputFileName to output,
108
109 * Clent also has to choose wich ranker algorithm to use,eg.
110
111 * Summarizer will also get stop words hashing map table set up.
112
113 */
114
115 public Summarizer(String inputFileName, String outputFileName, Ranker ranker) {
116
117 this.outputFileName = outputFileName;
118
119 this.fileContentString = readFileToString(inputFileName);
120
121 this.ranker = ranker;
122
123 initStopWordsMap();
124
125 }
126
127 /**
128
129 * The only interface user can see currently,
130
131 * to sumarize a given file using ranker provided rank algorithm
132
133 */
134
135 public void fileSummarize() {
136
137 initSentenceList(); //first spilit doucument to sentences using BreakIterator
138
139 double[][] A = getSetencesWeightMatrix(); //caculate the wight matrix using overlap score of sentences pair
140
141 double[]P = ranker.getRankVec(A); //caculate each sentence importance value
142
143 sortAndWriteFile(P); //sort and ouput the sorted sentences
144
145 }
146
147
148
149 //below are all internal private functions
150
151
152
153 /**
154
155 * Caculate the total sentece num and store each sentence
156
157 * info to sentencesList(start, end)
158
159 */
160
161 private void initSentenceList() {
162
163
164
165 Locale currentLocale = new Locale("en", "US");
166
167 BreakIterator sentenceIter = BreakIterator.getSentenceInstance(currentLocale);
168
169 sentenceIter.setText(fileContentString);
170
171
172
173 int start = sentenceIter.first();
174
175 int end = sentenceIter.next();
176
177 while (end != sentenceIter.DONE) {
178
179 Sentence sentence = new Sentence();
180
181 sentence.start = start;
182
183 if (fileContentString.charAt(end - 1) == '\n') //we will eliminate the \n if a sentece has
184
185 sentence.end = end - 1; //[)
186
187 else
188
189 sentence.end = end;
190
191 sentenceList.add(sentence);
192
193 start = end;
194
195 end = sentenceIter.next();
196
197 }
198
199 }
200
201
202
203 /**
204
205 * Init of stop words hash map
206
207 */
208
209 private void initStopWordsMap() {
210
211 for (int i = 0; i < stopWords.length; i++)
212
213 hmStopWord.put(stopWords[i], null);
214
215 }
216
217
218
219 /**
220
221 * This is the key function of Summarizer
222
223 * It will caculate the overlap value
224
225 * for each pair of sentences
226
227 */
228
229 private double[][] getSetencesWeightMatrix( ) {
230
231
232
233 Map<String, List<Integer> > hm = new HashMap<String, List<Integer> >();
234
235 int sentenceNum = sentenceList.size();
236
237 double[][] A = new double[sentenceNum][sentenceNum];
238
239 //deal with all the words in each sentence
240
241 Locale currentLocale = new Locale("en", "US");
242
243 BreakIterator wordIter = BreakIterator.getWordInstance(currentLocale);
244
245
246
247 //we do the process twice one forward and one reverse
248
249 //only for the case of repated words in one sentence
250
251 //we suppose overlap(i, j) = overlap(j, i)
252
253 //if setentce i is "above above ok" sentence j is "above above wrong"
254
255 //the overlap is 2 , if sentence j is "above wrong" the overlap is 1.
256
257 for (int i = 0; i < sentenceNum; i++) {
258
259 Sentence sentence = sentenceList.get(i);
260
261 wordIter.setText(fileContentString.substring(sentence.start, sentence.end));
262
263
264
265 int start = wordIter.first();
266
267 int end = wordIter.next();
268
269 int wordNum = 0;
270
271 while (end != wordIter.DONE) {
272
273 String word = fileContentString.substring(sentence.start + start, sentence.start + end);
274
275 if (Character.isLetterOrDigit(word.charAt(0))) {
276
277 wordNum++; //ok, find legal word
278
279
280
281 String token = parseString(word);
282
283 if (hmStopWord.containsKey(token)) {
284
285 start = end;
286
287 end = wordIter.next();
288
289 continue;
290
291 }
292
293 List<Integer> list;
294
295 if (hm.containsKey(token)) {
296
297 list = hm.get(token);
298
299 int flag = 0;
300
301 for (int j : list) {
302
303 if (i > j)
304
305 A[i][j] += 1.0;
306
307 else
308
309 flag = 1;
310
311 }
312
313 if (flag == 0)
314
315 list.add(i);
316
317 } else {
318
319 list = new LinkedList<Integer> ();
320
321 list.add(i);
322
323 hm.put(token, list);
324
325 }
326
327 }
328
329 start = end;
330
331 end = wordIter.next();
332
333 }
334
335 sentence.wordNum = wordNum;
336
337 }
338
339
340
341 hm.clear();
342
343
344
345 for (int i = sentenceNum - 1; i >= 0; i--) {
346
347 Sentence sentence = sentenceList.get(i);
348
349 wordIter.setText(fileContentString.substring(sentence.start, sentence.end));
350
351
352
353 int start = wordIter.first();
354
355 int end = wordIter.next();
356
357 while (end != wordIter.DONE) {
358
359 String word = fileContentString.substring(sentence.start + start, sentence.start + end);
360
361 if (Character.isLetterOrDigit(word.charAt(0))) {
362
363 //ok,find a legal word
364
365 String token = parseString(word);
366
367 if (hmStopWord.containsKey(token)) {
368
369 start = end;
370
371 end = wordIter.next();
372
373 continue;
374
375 }
376
377 List<Integer> list;
378
379 if (hm.containsKey(token)) {
380
381 list = hm.get(token);
382
383 int flag = 0;
384
385 for (int j : list) {
386
387 if (i < j)
388
389 A[i][j] += 1.0;
390
391 else
392
393 flag = 1;
394
395 }
396
397 if (flag == 0)
398
399 list.add(i);
400
401
402
403 } else {
404
405 list = new LinkedList<Integer> ();
406
407 list.add(i);
408
409 hm.put(token, list);
410
411 }
412
413 }
414
415 start = end;
416
417 end = wordIter.next();
418
419 }
420
421 }
422
423
424
425 //make sure overlap(i, j) = overlap(j, i) when not considering length
426
427 for (int i = 0; i < sentenceNum; i++) {
428
429 for (int j = 0; j > i && j < sentenceNum; j++) {
430
431 if (A[i][j] > A[j][i])
432
433 A[i][j] = A[j][i];
434
435 }
436
437 }
438
439
440
441 //divide by sentence lenght to avoid promoting long sentences
442
443 for (int i = 0; i < sentenceNum; i++) {
444
445 for (int j = 0; j < sentenceNum; j++) {
446
447 if (A[i][j] > 0 && sentenceList.get(i).wordNum > 0)
448
449 A[i][j] /= sentenceList.get(i).wordNum;
450
451 }
452
453 }
454
455
456
457 return A;
458
459 }
460
461
462
463 /** Read the whole input file content*/
464
465 private static String readFileToString(String infile) {
466
467 try {
468
469 BufferedReader reader = new BufferedReader(new FileReader(infile));
470
471 String result ="";
472
473 String line = reader.readLine();
474
475 while (line != null) {
476
477 result = result + line + '\n';
478
479 line = reader.readLine();
480
481 }
482
483 reader.close();
484
485 return result;
486
487 } catch(IOException ex) {
488
489 System.out.println(ex); //handle file not found by displaying message
490
491 return ""; //return the empty string if file not fount
492
493 }
494
495 }
496
497
498
499 /**
500
501 * For each word we will consider it as lowercase
502
503 * and delete any non letter character.
504
505 */
506
507 private static String parseString(String s) {
508
509 return s.toLowerCase().replaceAll("\\W", "");
510
511 }
512
513
514
515 /**
516
517 * Print the sentence which has index i
518
519 */
520
521 private String getSentenceString(int i) {
522
523 Sentence sentence = sentenceList.get(i);
524
525 return fileContentString.substring(sentence.start, sentence.end);
526
527 }
528
529
530
531 /**
532
533 * After running rank algorithm, sort the result and output the sentences based
534
535 * on their score
536
537 */
538
539 class CollatorComparator implements Comparator<Double> {
540
541 public int compare(Double element1, Double element2) {
542
543 double key1 = element1.doubleValue();
544
545 double key2 = element2.doubleValue();
546
547 if (key1 == key2)
548
549 return 1;
550
551 if (key1 > key2)
552
553 return -1;
554
555 else
556
557 return 1;
558
559 }
560
561 }
562
563
564
565 private void sortAndWriteFile(double[] P) {
566
567 try {
568
569 CollatorComparator comparator = new CollatorComparator();
570
571 Map<Double,Integer> map = new TreeMap<Double, Integer>(comparator);
572
573
574
575 for (int i = 0; i < P.length; i++)
576
577 map.put(P[i], i);
578
579
580
581 Set entries = map.entrySet();//得到关键字/值对的集合
582
583 Iterator iter = entries.iterator();
584
585 PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(outputFileName)));
586
587 DecimalFormat df = new DecimalFormat("#0.00000");
588
589 while(iter.hasNext()) {
590
591 Map.Entry entry = (Map.Entry)iter.next();
592
593 double keyWeight = (Double)entry.getKey();
594
595 int valueIndex = (Integer)entry.getValue();
596
597 writer.write("(");
598
599 writer.write(df.format(keyWeight));
600
601 writer.write(") ");
602
603 writer.write(getSentenceString(valueIndex) + "\n");
604
605 }
606
607 writer.close();
608
609 } catch (IOException ex) {
610
611 System.out.println(ex); //handle file not found by displaying message
612
613 }
614
615 }
616
617
618 } // end of class Sumarizer
619
2
3 import java.util.*;
4
5 import java.io.*;
6
7 import java.text.*;
8
9 //to do how to split also consider ?
10
11
12
13 /**
14
15 * Summarizer is on charge of caculating
16
17 * a Matrix A, with A[i][j] represents
18
19 * the overlap of sentence i and j.
20
21 * It will than send the Matrix A to a
22
23 * Ranker who will caculate all the sentence's
24
25 * importance value and send back to Summarizer.
26
27 * Summarizer will sort sentences based on
28
29 * their importance value and write the reult
30
31 * to file.
32
33 *
34
35 * Summarizer is also on charge of spliting doucument
36
37 * content to sentences and split each sentence to
38
39 * words.
40
41 */
42
43 public class Summarizer {
44
45
46
47 /**
48
49 * Define Sentence type, it will record the sentence's
50
51 * start and end postion in fileContentString.
52
53 * Also will record the word number in this sentence.
54
55 */
56
57 private class Sentence {
58
59 public int start;
60
61 public int end;
62
63 public int wordNum;
64
65 }
66
67
68
69 /** Store which rank aloritm to use*/
70
71 private final Ranker ranker;
72
73 /** Store the name of output file*/
74
75 private final String outputFileName;
76
77 /** The whole file content is stored as a string here*/
78
79 private final String fileContentString;
80
81 /** sotre all the sentences info*/
82
83 private final List<Sentence> sentenceList = new ArrayList<Sentence> ();;
84
85 /** the dfault stop words we will use*/
86
87 private final static String stopWords[] =
88
89 {"a", "about", "an", "and", "are", "as", "at", "be", "but", "by",
90
91 "for", "has", "have", "he", "his", "her", "in", "is", "it", "its",
92
93 "of", "on", "or", "she", "that", "the", "their", "they", "this",
94
95 "to", "was", "which", "who", "will", "with", "you"};
96
97 private final Map<String, String> hmStopWord = new HashMap<String, String> ();
98
99
100
101 /**
102
103 * Construct of a summarizer
104
105 * Client must provide the inputFileName to sumarize
106
107 * and the aimed outputFileName to output,
108
109 * Clent also has to choose wich ranker algorithm to use,eg.
110
111 * Summarizer will also get stop words hashing map table set up.
112
113 */
114
115 public Summarizer(String inputFileName, String outputFileName, Ranker ranker) {
116
117 this.outputFileName = outputFileName;
118
119 this.fileContentString = readFileToString(inputFileName);
120
121 this.ranker = ranker;
122
123 initStopWordsMap();
124
125 }
126
127 /**
128
129 * The only interface user can see currently,
130
131 * to sumarize a given file using ranker provided rank algorithm
132
133 */
134
135 public void fileSummarize() {
136
137 initSentenceList(); //first spilit doucument to sentences using BreakIterator
138
139 double[][] A = getSetencesWeightMatrix(); //caculate the wight matrix using overlap score of sentences pair
140
141 double[]P = ranker.getRankVec(A); //caculate each sentence importance value
142
143 sortAndWriteFile(P); //sort and ouput the sorted sentences
144
145 }
146
147
148
149 //below are all internal private functions
150
151
152
153 /**
154
155 * Caculate the total sentece num and store each sentence
156
157 * info to sentencesList(start, end)
158
159 */
160
161 private void initSentenceList() {
162
163
164
165 Locale currentLocale = new Locale("en", "US");
166
167 BreakIterator sentenceIter = BreakIterator.getSentenceInstance(currentLocale);
168
169 sentenceIter.setText(fileContentString);
170
171
172
173 int start = sentenceIter.first();
174
175 int end = sentenceIter.next();
176
177 while (end != sentenceIter.DONE) {
178
179 Sentence sentence = new Sentence();
180
181 sentence.start = start;
182
183 if (fileContentString.charAt(end - 1) == '\n') //we will eliminate the \n if a sentece has
184
185 sentence.end = end - 1; //[)
186
187 else
188
189 sentence.end = end;
190
191 sentenceList.add(sentence);
192
193 start = end;
194
195 end = sentenceIter.next();
196
197 }
198
199 }
200
201
202
203 /**
204
205 * Init of stop words hash map
206
207 */
208
209 private void initStopWordsMap() {
210
211 for (int i = 0; i < stopWords.length; i++)
212
213 hmStopWord.put(stopWords[i], null);
214
215 }
216
217
218
219 /**
220
221 * This is the key function of Summarizer
222
223 * It will caculate the overlap value
224
225 * for each pair of sentences
226
227 */
228
229 private double[][] getSetencesWeightMatrix( ) {
230
231
232
233 Map<String, List<Integer> > hm = new HashMap<String, List<Integer> >();
234
235 int sentenceNum = sentenceList.size();
236
237 double[][] A = new double[sentenceNum][sentenceNum];
238
239 //deal with all the words in each sentence
240
241 Locale currentLocale = new Locale("en", "US");
242
243 BreakIterator wordIter = BreakIterator.getWordInstance(currentLocale);
244
245
246
247 //we do the process twice one forward and one reverse
248
249 //only for the case of repated words in one sentence
250
251 //we suppose overlap(i, j) = overlap(j, i)
252
253 //if setentce i is "above above ok" sentence j is "above above wrong"
254
255 //the overlap is 2 , if sentence j is "above wrong" the overlap is 1.
256
257 for (int i = 0; i < sentenceNum; i++) {
258
259 Sentence sentence = sentenceList.get(i);
260
261 wordIter.setText(fileContentString.substring(sentence.start, sentence.end));
262
263
264
265 int start = wordIter.first();
266
267 int end = wordIter.next();
268
269 int wordNum = 0;
270
271 while (end != wordIter.DONE) {
272
273 String word = fileContentString.substring(sentence.start + start, sentence.start + end);
274
275 if (Character.isLetterOrDigit(word.charAt(0))) {
276
277 wordNum++; //ok, find legal word
278
279
280
281 String token = parseString(word);
282
283 if (hmStopWord.containsKey(token)) {
284
285 start = end;
286
287 end = wordIter.next();
288
289 continue;
290
291 }
292
293 List<Integer> list;
294
295 if (hm.containsKey(token)) {
296
297 list = hm.get(token);
298
299 int flag = 0;
300
301 for (int j : list) {
302
303 if (i > j)
304
305 A[i][j] += 1.0;
306
307 else
308
309 flag = 1;
310
311 }
312
313 if (flag == 0)
314
315 list.add(i);
316
317 } else {
318
319 list = new LinkedList<Integer> ();
320
321 list.add(i);
322
323 hm.put(token, list);
324
325 }
326
327 }
328
329 start = end;
330
331 end = wordIter.next();
332
333 }
334
335 sentence.wordNum = wordNum;
336
337 }
338
339
340
341 hm.clear();
342
343
344
345 for (int i = sentenceNum - 1; i >= 0; i--) {
346
347 Sentence sentence = sentenceList.get(i);
348
349 wordIter.setText(fileContentString.substring(sentence.start, sentence.end));
350
351
352
353 int start = wordIter.first();
354
355 int end = wordIter.next();
356
357 while (end != wordIter.DONE) {
358
359 String word = fileContentString.substring(sentence.start + start, sentence.start + end);
360
361 if (Character.isLetterOrDigit(word.charAt(0))) {
362
363 //ok,find a legal word
364
365 String token = parseString(word);
366
367 if (hmStopWord.containsKey(token)) {
368
369 start = end;
370
371 end = wordIter.next();
372
373 continue;
374
375 }
376
377 List<Integer> list;
378
379 if (hm.containsKey(token)) {
380
381 list = hm.get(token);
382
383 int flag = 0;
384
385 for (int j : list) {
386
387 if (i < j)
388
389 A[i][j] += 1.0;
390
391 else
392
393 flag = 1;
394
395 }
396
397 if (flag == 0)
398
399 list.add(i);
400
401
402
403 } else {
404
405 list = new LinkedList<Integer> ();
406
407 list.add(i);
408
409 hm.put(token, list);
410
411 }
412
413 }
414
415 start = end;
416
417 end = wordIter.next();
418
419 }
420
421 }
422
423
424
425 //make sure overlap(i, j) = overlap(j, i) when not considering length
426
427 for (int i = 0; i < sentenceNum; i++) {
428
429 for (int j = 0; j > i && j < sentenceNum; j++) {
430
431 if (A[i][j] > A[j][i])
432
433 A[i][j] = A[j][i];
434
435 }
436
437 }
438
439
440
441 //divide by sentence lenght to avoid promoting long sentences
442
443 for (int i = 0; i < sentenceNum; i++) {
444
445 for (int j = 0; j < sentenceNum; j++) {
446
447 if (A[i][j] > 0 && sentenceList.get(i).wordNum > 0)
448
449 A[i][j] /= sentenceList.get(i).wordNum;
450
451 }
452
453 }
454
455
456
457 return A;
458
459 }
460
461
462
463 /** Read the whole input file content*/
464
465 private static String readFileToString(String infile) {
466
467 try {
468
469 BufferedReader reader = new BufferedReader(new FileReader(infile));
470
471 String result ="";
472
473 String line = reader.readLine();
474
475 while (line != null) {
476
477 result = result + line + '\n';
478
479 line = reader.readLine();
480
481 }
482
483 reader.close();
484
485 return result;
486
487 } catch(IOException ex) {
488
489 System.out.println(ex); //handle file not found by displaying message
490
491 return ""; //return the empty string if file not fount
492
493 }
494
495 }
496
497
498
499 /**
500
501 * For each word we will consider it as lowercase
502
503 * and delete any non letter character.
504
505 */
506
507 private static String parseString(String s) {
508
509 return s.toLowerCase().replaceAll("\\W", "");
510
511 }
512
513
514
515 /**
516
517 * Print the sentence which has index i
518
519 */
520
521 private String getSentenceString(int i) {
522
523 Sentence sentence = sentenceList.get(i);
524
525 return fileContentString.substring(sentence.start, sentence.end);
526
527 }
528
529
530
531 /**
532
533 * After running rank algorithm, sort the result and output the sentences based
534
535 * on their score
536
537 */
538
539 class CollatorComparator implements Comparator<Double> {
540
541 public int compare(Double element1, Double element2) {
542
543 double key1 = element1.doubleValue();
544
545 double key2 = element2.doubleValue();
546
547 if (key1 == key2)
548
549 return 1;
550
551 if (key1 > key2)
552
553 return -1;
554
555 else
556
557 return 1;
558
559 }
560
561 }
562
563
564
565 private void sortAndWriteFile(double[] P) {
566
567 try {
568
569 CollatorComparator comparator = new CollatorComparator();
570
571 Map<Double,Integer> map = new TreeMap<Double, Integer>(comparator);
572
573
574
575 for (int i = 0; i < P.length; i++)
576
577 map.put(P[i], i);
578
579
580
581 Set entries = map.entrySet();//得到关键字/值对的集合
582
583 Iterator iter = entries.iterator();
584
585 PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(outputFileName)));
586
587 DecimalFormat df = new DecimalFormat("#0.00000");
588
589 while(iter.hasNext()) {
590
591 Map.Entry entry = (Map.Entry)iter.next();
592
593 double keyWeight = (Double)entry.getKey();
594
595 int valueIndex = (Integer)entry.getValue();
596
597 writer.write("(");
598
599 writer.write(df.format(keyWeight));
600
601 writer.write(") ");
602
603 writer.write(getSentenceString(valueIndex) + "\n");
604
605 }
606
607 writer.close();
608
609 } catch (IOException ex) {
610
611 System.out.println(ex); //handle file not found by displaying message
612
613 }
614
615 }
616
617
618 } // end of class Sumarizer
619
//MatrixHelp.java 内部使用的一些矩阵操作相关函数的实现
1 /**
2 * This class has all the independent caculations related to matrix and vector
3 * that will be used in text sumarization.
4 * Notice those fuctions for this paticular project useage, does not consider
5 * more about reuse, the data type all use double and assume two dimessional
6 * matrix with same dinmenstion length(square matrix).
7 */
8 public class MatrixHelp {
9
10 /** retrun squared distance of vector P1 and P2*/
11 public static double distance(double[] P1, double[] P2) {
12 int len = P1.length; //P1.length must equal to P2.length
13 double sum = 0;
14 for (int i = 0; i < len; i++)
15 sum += (P1[i] - P2[i]) * (P1[i] - P2[i]);
16 return sum;
17 }
18
19 /** say H = transpose([a,b,c]) normal H means let H = transpose([a,b,c]/sqrt(a^2+b^2+c^2))*/
20 public static void normal(double[] P) {
21 int len = P.length;
22 double sum = 0;
23 for (int i = 0; i < len; i++)
24 sum += P[i];
25 for (int i = 0; i < len; i++)
26 P[i] /= sum;
27 }
28
29 /** return value of ||P||*/
30 public static double normValue(double[] P) {
31 double sum = 0;
32 for (int i = 0; i < P.length; i++)
33 sum += Math.abs(P[i]);
34 return sum;
35 }
36
37 /** sub return P1 - P2*/
38 public static double[] subVec(double[] P1, double[] P2) {
39 int len = P1.length;
40 double[] Result = new double[len];
41 for (int i = 0; i < len; i++)
42 Result[i] = P1[i] - P2[i];
43 return Result;
44 }
45
46 /** merge return (P1 + P2)/2.0*/
47 public static double[] mergeVec(double[] P1, double[] P2) {
48 int len = P1.length;
49 double[] Result = new double[len];
50 for (int i = 0; i < len; i++)
51 Result[i] = (P1[i] + P2[i])/2.0;
52 return Result;
53 }
54
55
56 /** normalize all the value in vec P to 0.001-0.0.9995*/
57 public static void normalize(double[] P) {
58 double dMaxValue = -1.0;
59 double dMinValue = 2.0;
60 int len = P.length;
61 for (int i = 0; i < len; i++) {
62 if (P[i] < dMinValue)
63 dMinValue = P[i];
64 if (P[i] > dMaxValue)
65 dMaxValue = P[i];
66 }
67 if (dMaxValue - dMinValue < 0.0000001)
68 return;
69 for (int i = 0; i < P.length; i++) {
70 if(P[i] <= (dMinValue*1.005)) //do not use boundary value
71 P[i]= dMinValue*1.005;
72 if(P[i] >= (dMaxValue*0.995))
73 P[i] = dMaxValue*0.995;
74 P[i]=(P[i]-dMinValue)/(dMaxValue-dMinValue);
75 P[i] = ((0.5-0.001)/0.5)*P[i] + 0.001;
76 }
77 }
78 /** return the result matrix of matrix A mutliply (transpose of A)*/
79 public static double[][] matrixMultiply(double[][] A) {
80 int len = A[0].length;
81 double[][] Result = new double[len][len];
82 for (int i = 0; i < len; i++)
83 for (int j = 0; j < len; j++)
84 for (int k = 0; k < len; k++)
85 Result[i][j] += A[i][k] * A[j][k]; //A[i][k] * AT[k][j]
86 return Result;
87 }
88
89 /** return the result matrix of matrix A mutliply matrix B,A and B of same dimention size*/
90 public static double[][] matrixMultiply(double[][] A, double[][] B) {
91 int len = A[0].length;
92 double[][] Result = new double[len][len];
93 for (int i = 0; i < len; i++)
94 for (int j = 0; j < len; j++)
95 for (int k = 0; j < len; k++)
96 Result[i][j] += A[i][k] * B[k][j];
97 return Result;
98 }
99 /** return the result vector of matrix A mutliply vector P*/
100 public static double[] matrixMultiplyVector(double[][] A, double[] P) {
101 int len = P.length;
102 double[] Result = new double[len];
103 for (int i = 0; i < len; i++)
104 for (int j = 0; j < len; j++)
105 Result[i] += A[i][j] * P[j];
106 return Result;
107 }
108
109
110 /** return the result vector of first transposing matrix A than mutliply vector P*/
111 public static double[] matrixTransposeMultiplyVector(double[][] A, double[] P) {
112 int len = P.length;
113 double[] Result = new double[len];
114 for (int i = 0; i < len; i++)
115 for (int j = 0; j < len; j++) {
116 Result[i] += A[j][i] * P[j];
117 }
118
119 return Result;
120 }
121
122 /** for debug show all matrix data value*/
123 public static void printMatrix(double[][] A) {
124 int len = A[0].length;
125 for (int i = 0; i < len; i++) {
126 for (int j = 0; j < len; j++) {
127 System.out.printf("%.2f", A[i][j]);
128 System.out.print(" ");
129 }
130 System.out.println("");
131 }
132 }
133
134 /** for debug show all vector data value*/
135 public static void printVector(double[] P) {
136 for (int i = 0; i < P.length; i++)
137 System.out.printf("%.2f", P[i]);
138 System.out.println("\n");
139 }
140 }
141
2 * This class has all the independent caculations related to matrix and vector
3 * that will be used in text sumarization.
4 * Notice those fuctions for this paticular project useage, does not consider
5 * more about reuse, the data type all use double and assume two dimessional
6 * matrix with same dinmenstion length(square matrix).
7 */
8 public class MatrixHelp {
9
10 /** retrun squared distance of vector P1 and P2*/
11 public static double distance(double[] P1, double[] P2) {
12 int len = P1.length; //P1.length must equal to P2.length
13 double sum = 0;
14 for (int i = 0; i < len; i++)
15 sum += (P1[i] - P2[i]) * (P1[i] - P2[i]);
16 return sum;
17 }
18
19 /** say H = transpose([a,b,c]) normal H means let H = transpose([a,b,c]/sqrt(a^2+b^2+c^2))*/
20 public static void normal(double[] P) {
21 int len = P.length;
22 double sum = 0;
23 for (int i = 0; i < len; i++)
24 sum += P[i];
25 for (int i = 0; i < len; i++)
26 P[i] /= sum;
27 }
28
29 /** return value of ||P||*/
30 public static double normValue(double[] P) {
31 double sum = 0;
32 for (int i = 0; i < P.length; i++)
33 sum += Math.abs(P[i]);
34 return sum;
35 }
36
37 /** sub return P1 - P2*/
38 public static double[] subVec(double[] P1, double[] P2) {
39 int len = P1.length;
40 double[] Result = new double[len];
41 for (int i = 0; i < len; i++)
42 Result[i] = P1[i] - P2[i];
43 return Result;
44 }
45
46 /** merge return (P1 + P2)/2.0*/
47 public static double[] mergeVec(double[] P1, double[] P2) {
48 int len = P1.length;
49 double[] Result = new double[len];
50 for (int i = 0; i < len; i++)
51 Result[i] = (P1[i] + P2[i])/2.0;
52 return Result;
53 }
54
55
56 /** normalize all the value in vec P to 0.001-0.0.9995*/
57 public static void normalize(double[] P) {
58 double dMaxValue = -1.0;
59 double dMinValue = 2.0;
60 int len = P.length;
61 for (int i = 0; i < len; i++) {
62 if (P[i] < dMinValue)
63 dMinValue = P[i];
64 if (P[i] > dMaxValue)
65 dMaxValue = P[i];
66 }
67 if (dMaxValue - dMinValue < 0.0000001)
68 return;
69 for (int i = 0; i < P.length; i++) {
70 if(P[i] <= (dMinValue*1.005)) //do not use boundary value
71 P[i]= dMinValue*1.005;
72 if(P[i] >= (dMaxValue*0.995))
73 P[i] = dMaxValue*0.995;
74 P[i]=(P[i]-dMinValue)/(dMaxValue-dMinValue);
75 P[i] = ((0.5-0.001)/0.5)*P[i] + 0.001;
76 }
77 }
78 /** return the result matrix of matrix A mutliply (transpose of A)*/
79 public static double[][] matrixMultiply(double[][] A) {
80 int len = A[0].length;
81 double[][] Result = new double[len][len];
82 for (int i = 0; i < len; i++)
83 for (int j = 0; j < len; j++)
84 for (int k = 0; k < len; k++)
85 Result[i][j] += A[i][k] * A[j][k]; //A[i][k] * AT[k][j]
86 return Result;
87 }
88
89 /** return the result matrix of matrix A mutliply matrix B,A and B of same dimention size*/
90 public static double[][] matrixMultiply(double[][] A, double[][] B) {
91 int len = A[0].length;
92 double[][] Result = new double[len][len];
93 for (int i = 0; i < len; i++)
94 for (int j = 0; j < len; j++)
95 for (int k = 0; j < len; k++)
96 Result[i][j] += A[i][k] * B[k][j];
97 return Result;
98 }
99 /** return the result vector of matrix A mutliply vector P*/
100 public static double[] matrixMultiplyVector(double[][] A, double[] P) {
101 int len = P.length;
102 double[] Result = new double[len];
103 for (int i = 0; i < len; i++)
104 for (int j = 0; j < len; j++)
105 Result[i] += A[i][j] * P[j];
106 return Result;
107 }
108
109
110 /** return the result vector of first transposing matrix A than mutliply vector P*/
111 public static double[] matrixTransposeMultiplyVector(double[][] A, double[] P) {
112 int len = P.length;
113 double[] Result = new double[len];
114 for (int i = 0; i < len; i++)
115 for (int j = 0; j < len; j++) {
116 Result[i] += A[j][i] * P[j];
117 }
118
119 return Result;
120 }
121
122 /** for debug show all matrix data value*/
123 public static void printMatrix(double[][] A) {
124 int len = A[0].length;
125 for (int i = 0; i < len; i++) {
126 for (int j = 0; j < len; j++) {
127 System.out.printf("%.2f", A[i][j]);
128 System.out.print(" ");
129 }
130 System.out.println("");
131 }
132 }
133
134 /** for debug show all vector data value*/
135 public static void printVector(double[] P) {
136 for (int i = 0; i < P.length; i++)
137 System.out.printf("%.2f", P[i]);
138 System.out.println("\n");
139 }
140 }
141
//Ranker.java
1 /**
2 * Ranker an absturct class for calculating rank score of all sentences
3 * Right now there are two implementations
4 * PageRankRanker using pagerank algorithom and HitsRanker using hits
5 * algorithom.
6 */
7 public interface Ranker {
8 /**
9 * Given the Weight matrix A
10 * return the vec needed after doing iterate algorithm,eg pagerank, hits.
11 */
12 public double[] getRankVec(double[][] A);
13 }
2 * Ranker an absturct class for calculating rank score of all sentences
3 * Right now there are two implementations
4 * PageRankRanker using pagerank algorithom and HitsRanker using hits
5 * algorithom.
6 */
7 public interface Ranker {
8 /**
9 * Given the Weight matrix A
10 * return the vec needed after doing iterate algorithm,eg pagerank, hits.
11 */
12 public double[] getRankVec(double[][] A);
13 }
//PageRankRanker.java
//pager rank 算法的实现
1 /**
2 * The implementaion of Ranker based on pagerank algorithm.
3 */
4 public class PageRankRanker implements Ranker {
5
6 public double[] getRankVec(double[][] A) {
7 return pageRankIterate(getInitMatrix(A), 0.85);
8 }
9
10
11 /**
12 * Kernal of page rank iterate
13 * Taking A as input matrixand d(damping factor,use 0.85 will be ok)
14 * and return the result vector with weight value
15 * for each sentence
16 */
17 public static double[] pageRankIterate(double[][] A, double d) {
18 int len = A[0].length;
19 double[][] P= new double[2][len];
20 int now = 0;
21 int other = 1;
22 int temp;
23 for (int i = 0; i < len; i++)
24 P[now][i] = 1.0/len;
25
26 double threshould = 0.0000001; //spell wrong?
27 do {
28 P[other] = MatrixHelp.matrixMultiplyVector(A, P[now]);
29 for (int i = 0; i < len; i++)
30 //P[other][i] = d * P[other][i] + (1 - d);
31 P[other][i] = d * P[other][i] + (1 - d);
32 temp = now;
33 now = other;
34 other = temp;
35 } while(MatrixHelp.distance(P[now], P[other]) > threshould);
36
37 MatrixHelp.normalize(P[now]);
38 return P[now];
39 }
40
41 /**
42 * Get the init matrix for ranker to use based
43 * on input weight matrix A.
44 */
45 private static double[][] getInitMatrix(double[][] A) {
46 int len = A[0].length;
47 double[][] B = new double[len][len];
48 //for page rank init matrix
49 double weightSum;
50 for (int j = 0; j < len; j++) {
51 weightSum = 0.0;
52 for(int k = 0; k < len; k++) {
53 //weightSum += A[k][j];
54 weightSum += A[j][k];
55 }
56 for (int i = 0; i < len; i++) {
57 if (weightSum > 0.0000001) {
58 B[i][j] = A[i][j] / weightSum;
59 //B[i][j] = A[j][i] /weightSum;
60 } else if (i == j) {
61 B[i][j] = 0.0;
62 } else {
63 B[i][j] = 1.00/(len - 1);
64 }
65 }
66 }
67 return B;
68 }
69
70 }
71
2 * The implementaion of Ranker based on pagerank algorithm.
3 */
4 public class PageRankRanker implements Ranker {
5
6 public double[] getRankVec(double[][] A) {
7 return pageRankIterate(getInitMatrix(A), 0.85);
8 }
9
10
11 /**
12 * Kernal of page rank iterate
13 * Taking A as input matrixand d(damping factor,use 0.85 will be ok)
14 * and return the result vector with weight value
15 * for each sentence
16 */
17 public static double[] pageRankIterate(double[][] A, double d) {
18 int len = A[0].length;
19 double[][] P= new double[2][len];
20 int now = 0;
21 int other = 1;
22 int temp;
23 for (int i = 0; i < len; i++)
24 P[now][i] = 1.0/len;
25
26 double threshould = 0.0000001; //spell wrong?
27 do {
28 P[other] = MatrixHelp.matrixMultiplyVector(A, P[now]);
29 for (int i = 0; i < len; i++)
30 //P[other][i] = d * P[other][i] + (1 - d);
31 P[other][i] = d * P[other][i] + (1 - d);
32 temp = now;
33 now = other;
34 other = temp;
35 } while(MatrixHelp.distance(P[now], P[other]) > threshould);
36
37 MatrixHelp.normalize(P[now]);
38 return P[now];
39 }
40
41 /**
42 * Get the init matrix for ranker to use based
43 * on input weight matrix A.
44 */
45 private static double[][] getInitMatrix(double[][] A) {
46 int len = A[0].length;
47 double[][] B = new double[len][len];
48 //for page rank init matrix
49 double weightSum;
50 for (int j = 0; j < len; j++) {
51 weightSum = 0.0;
52 for(int k = 0; k < len; k++) {
53 //weightSum += A[k][j];
54 weightSum += A[j][k];
55 }
56 for (int i = 0; i < len; i++) {
57 if (weightSum > 0.0000001) {
58 B[i][j] = A[i][j] / weightSum;
59 //B[i][j] = A[j][i] /weightSum;
60 } else if (i == j) {
61 B[i][j] = 0.0;
62 } else {
63 B[i][j] = 1.00/(len - 1);
64 }
65 }
66 }
67 return B;
68 }
69
70 }
71
//HitsRanker.java
//hits算法的实现
1 /**
2 * The implementation of Ranker based on hits algorithm.
3 */
4 public class HitsRanker implements Ranker {
5
6 public double[] getRankVec(double[][] A) {
7 return hitsIterate(A);
8 }
9
10 /** kernal of hits alogrithm*/
11 public static double[] hitsIterate(double[][] B) {
12 int len = B[0].length;
13 double[][] L = MatrixHelp.matrixMultiply(B); //B*BT
14
15 double[][] H = new double[2][len]; //hub
16 double[][] A = new double[2][len]; //authority
17 int now = 0;
18 int other = 1;
19 int temp;
20
21 for (int i = 0; i < len; i++) {
22 H[now][i] = A[now][i] = 1.0;
23 }
24 double threshould = 0.0000001;
25 do {
26 A[other] = MatrixHelp.matrixTransposeMultiplyVector(L, A[now]);
27 H[other] = MatrixHelp.matrixMultiplyVector(L, H[now]);
28 MatrixHelp.normal(A[other]);
29 MatrixHelp.normal(H[other]);
30 //swap now and other
31 temp = now;
32 now = other;
33 other = temp;
34 }while(MatrixHelp.normValue(MatrixHelp.subVec(H[now], H[other])) > threshould &&
35 MatrixHelp.normValue((MatrixHelp.subVec(A[now], A[other])))> threshould);
36
37 double[] R = MatrixHelp.mergeVec(A[now], H[now]);
38 MatrixHelp.normalize(R);
39 return R;
40 }
41 }
42
2 * The implementation of Ranker based on hits algorithm.
3 */
4 public class HitsRanker implements Ranker {
5
6 public double[] getRankVec(double[][] A) {
7 return hitsIterate(A);
8 }
9
10 /** kernal of hits alogrithm*/
11 public static double[] hitsIterate(double[][] B) {
12 int len = B[0].length;
13 double[][] L = MatrixHelp.matrixMultiply(B); //B*BT
14
15 double[][] H = new double[2][len]; //hub
16 double[][] A = new double[2][len]; //authority
17 int now = 0;
18 int other = 1;
19 int temp;
20
21 for (int i = 0; i < len; i++) {
22 H[now][i] = A[now][i] = 1.0;
23 }
24 double threshould = 0.0000001;
25 do {
26 A[other] = MatrixHelp.matrixTransposeMultiplyVector(L, A[now]);
27 H[other] = MatrixHelp.matrixMultiplyVector(L, H[now]);
28 MatrixHelp.normal(A[other]);
29 MatrixHelp.normal(H[other]);
30 //swap now and other
31 temp = now;
32 now = other;
33 other = temp;
34 }while(MatrixHelp.normValue(MatrixHelp.subVec(H[now], H[other])) > threshould &&
35 MatrixHelp.normValue((MatrixHelp.subVec(A[now], A[other])))> threshould);
36
37 double[] R = MatrixHelp.mergeVec(A[now], H[now]);
38 MatrixHelp.normalize(R);
39 return R;
40 }
41 }
42