logique_gold.py
# -*- coding: utf-8 -*-
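"""Candidate-summary generator built on top of sume.

For each topic directory under ./tac2008: score sentences by concept-weight
density, keep the 15 best, enumerate every 2- to 6-sentence combination that
fits in 100 words, and write each candidate to ./abstracts under a DUC/TAC
peer-style file name.
"""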
from __future__ import division
import codecs
import itertools
import os
import sys
from itertools import chain

import sume

def flatmap(iterables):
    # flatten one level of nesting (kept from the original; unused below)
    return chain.from_iterable(iterables)
def to_peers_name(corpus_name, extension='txt'):
    # Build a DUC/TAC-style peer file name from a topic directory name by
    # dropping the character at index 6 and moving corpus_name[5] into the
    # summarizer-id slot.
    part1 = corpus_name[:5]
    part2 = corpus_name[5:6]
    part3 = corpus_name[7:]
    return part1 + '-' + part3 + '.M.100.' + part2 + '.' + extension
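# Example (assuming TAC 2008 topic ids shaped like "D0801A-A"; the exact
# naming scheme is an assumption, not checked here):
#   to_peers_name("D0801A-A")  ->  "D0801-A.M.100.A.txt"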
# directory from which text documents to be summarized are loaded. Input
# files are expected to be in one tokenized sentence per line format.
corpus_path = "."
corpus_dir_name = "tac2008"
#~ dest_path = "."
#~ dest_dir_name = "peers"
abst_path = "."
abst_dir_name = "abstracts"
files = []
os.chdir(corpus_path)
# each topic is a directory under corpus_dir_name; collect the directory names
for root, dirs, filenames in os.walk(corpus_dir_name):
    for d in dirs:
        files.append(d)
files_number = len(files)
max_sum_finded = 0
sum_finded = 0
# create the summarizer and generate candidate summaries for each topic
for f in files:
    name = corpus_dir_name + "/" + f
    s = sume.ilp_models.ConceptBasedILPSummarizer(name)
    sys.stdout.write(f + ' pending')
    sys.stdout.flush()
    # load documents
    s.read_documents()
    # compute the parameters needed by the model
    # extract bigrams as concepts
    s.extract_ngrams()
    # compute document frequency as concept weights
    s.compute_document_frequency()
    # prune sentences that are shorter than 10 words, identical sentences and
    # those that begin and end with a quotation mark
    # (the keyword spelling mininum_sentence_length follows the sume API)
    s.prune_sentences(mininum_sentence_length=10,
                      remove_citations=True,
                      remove_redundancy=True)
    # weight of each sentence = sum of the document frequencies of its
    # concepts (poids is French for "weights"); enumerate() is used instead
    # of concepts.index(concept), which returned the first matching concept
    # list and so overwrote weights for sentences with identical concepts
    concepts = [sentence.concepts for sentence in s.sentences]
    poids = {}
    for j, concept in enumerate(concepts):
        somme_poids = 0
        for c in concept:
            somme_poids += s.weights[c]
        poids[j] = somme_poids
    # score = weight/length density, scaled and truncated so it sorts as an int
    lengths = [sentence.length for sentence in s.sentences]
    ratio = []
    for j in range(len(lengths)):
        ratio.append(int((poids[j] / lengths[j]) * 10000))
    # keep the 15 sentences with the best ratio, popping the score and weight
    # lists in parallel so indices stay aligned after each removal (the
    # original indexed poids with positions from the shrinking ratio list,
    # recording wrong weights after the first pick)
    best_sentences = []
    best_sentences_weights = []  # collected but not used further below
    weights_list = [poids[j] for j in range(len(ratio))]
    for _ in range(min(15, len(ratio))):  # guard against short topics
        best = ratio.index(max(ratio))
        best_sentences.append(s.sentences.pop(best).untokenized_form + " ")
        best_sentences_weights.append(weights_list.pop(best))
        ratio.pop(best)
    # remember the largest number of summaries produced for any previous
    # topic, then reset the per-topic counter
    if max_sum_finded < sum_finded:
        max_sum_finded = sum_finded
    sum_finded = 0
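    # Enumerating all 2- to 6-sentence combinations of 15 sentences yields at
    # most C(15,2)+C(15,3)+C(15,4)+C(15,5)+C(15,6)
    # = 105+455+1365+3003+5005 = 9933 candidates per topic, before the
    # 100-word filter below.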
    for nb_sentences in range(2, 7):
        combis = itertools.combinations(best_sentences, nb_sentences)
        summaries = []
        for combi in combis:
            summaries.append("".join(combi))
        # keep only the candidates that fit the 100-word budget
        good_summaries = []
        for summary in summaries:
            if len(summary.split()) <= 100:
                good_summaries.append(summary)
        # write each candidate summary to its own numbered peer file
        for p in good_summaries:
            #~ fo_name = to_peers_name(corpus_name=f, extension='txt.'+str(good_summaries.index(p)))
            fo_name = to_peers_name(corpus_name=f, extension='txt.' + str(sum_finded))
            sum_finded += 1
            fo = codecs.open(abst_path + '/' + abst_dir_name + '/' + fo_name, "wb", "utf-8")
            fo.write(p)
            fo.close()
    sys.stdout.write('\r' + f + ' => ' + fo_name + ' done\n')
# fold in the count from the last topic, which the in-loop check never sees
if max_sum_finded < sum_finded:
    max_sum_finded = sum_finded
sys.stdout.write('\nmax summaries found for one corpus = ' + str(max_sum_finded) + '\n')
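# Expected layout (inferred from the configuration above, not verified):
#   ./tac2008/<topic-dir>/...   input documents, one tokenized sentence per line
#   ./abstracts/                must already exist; candidates are written here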