# Script to use the Cognitive Services key phrase extraction API to process the contents
# of source files listed in a CSV file. The input file should be relatively small to minimize
# service usage (and not exceed free tier limits).
#
# You provide the endpoint and API key for your specific subscription through command-line args.
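#
# A minimal invocation sketch (the endpoint URL, key, and CSV name below are
# illustrative placeholders, not real values; the script appends "/keyphrases"
# to whatever endpoint you pass, so include the Text Analytics base path):
#
#   python extract_key_phrases.py --endpoint https://westus.api.cognitive.microsoft.com/text/analytics/v2.0 --key <api_key> catalog.csv
#
# This writes catalog-keyphrases.csv plus a phraselist.txt of all unique phrases.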
import http.client, urllib.parse
import requests
import json
import csv
import sys
from utilities import parse_endpoint_key_arguments, delineate_segments
import time


def get_key_phrases(endpoint, headers, params, body_text):
    try:
        conn = http.client.HTTPSConnection(endpoint)
        conn.request("POST", "/text/analytics/v2.0/keyPhrases?%s" % params, body_text, headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()
        return data
    except Exception as e:
        # A generic Exception may not carry errno/strerror, so print the error itself.
        print("get_key_phrases: request failed: %s" % e)
        return None
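
# Note: the requests-based call in extract_key_phrases() below now supersedes
# this http.client helper; the original call site is kept there as a comment.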

def split_at_last_paragraph(text, max_length):
    if len(text) < max_length:
        return (text, None)
    else:
        # Find the last \n up to max_length
        pos1 = text.rfind('\n', 0, max_length)

        # If for some reason there's no newline, just chop at the length
        # boundary; we might occasionally lose an important word here, so we
        # could be more sophisticated if we want, but this is good enough to start.
        if pos1 == -1:
            pos1 = max_length

        return (text[:pos1], text[(pos1 + 1):])
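
# For example (illustrative values), with max_length=10 the split lands on the
# last newline before the limit:
#   split_at_last_paragraph("para one\npara two", 10) -> ("para one", "para two")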

def extract_key_phrases(endpoint, key, input_file, output_file):
    print("extract_key_phrases: Starting key phrase extraction")

    # Constants -- this limit is imposed by the API
    max_doc_length = 5000

    headers = {
        # Request headers
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': key,
    }

    params = urllib.parse.urlencode({})

    all_phrases = []

    with open(input_file, encoding='utf-8') as f_in:
        # Output CSV has the same structure with an added key_phrases column
        reader = csv.reader(f_in)
        csv_headers = next(reader)
        csv_headers.append("key_phrases")
        index_file = csv_headers.index("file")

        with open(output_file, 'w', encoding='utf-8', newline='') as f_out:
            writer = csv.writer(f_out)
            writer.writerow(csv_headers)

            # Now go through each source file listed in the .csv, and run key
            # phrase extraction on the introductory text.
            for row in reader:
                filename = row[index_file]

                if filename == '':
                    continue

                with open(filename, 'r', encoding="utf-8") as f_source:
                    print("extract_key_phrases: Processing %s" % (filename))
                    text = f_source.read()
                    #_, intro_lines, metadata_lines = delineate_segments(content, filename)

                    # Strip off the header by finding the position of the second "---"
                    blocks = text.split("---", 2)

                    if len(blocks) < 3:
                        print("extract_key_phrases: Skipping file that appears to have a malformed metadata header, %s" % (filename))
                        continue

                    # Use just the text past the header
                    text = blocks[2]

                    # Keep only the text up to the next ## (any other subheading)
                    blocks = text.split("##", 1)
                    text = blocks[0][0:max_doc_length]

                    documents = []
                    documents.append({ "language": "en", "id": 1, "text": text })
                    body_json = {"documents": documents}
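
                    # body_json now has the Text Analytics "documents" shape,
                    # e.g. (illustrative text): {"documents": [{"language": "en",
                    # "id": 1, "text": "Intro text..."}]}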

                    # Earlier http.client-based call, superseded by requests:
                    # data = get_key_phrases(endpoint, headers, params, json.dumps(body_json))
                    # data_json = json.loads(data)
                    keyphrase_uri = endpoint + "/keyphrases"
                    response = requests.post(keyphrase_uri, headers=headers, json=body_json)

                    # Combine the list of key phrases for the documents, which is then the
                    # key phrases for the source file as a whole.
                    key_phrases = []

                    if response.status_code == 200:
                        for doc in response.json()["documents"]:
                            key_phrases = sorted(key_phrases + doc["keyPhrases"])

                        all_phrases.extend(key_phrases)
                    else:
                        print("Response code %d, %s" % (response.status_code, response.text))

                    # Append the phrases list (;-separated) to the CSV row and write it.
                    row.append(';'.join(key_phrases))
                    writer.writerow(row)

                    # Throttle ourselves to avoid Cognitive Services rate limits
                    time.sleep(1)

    all_phrases = sorted(set(all_phrases))

    with open('phraselist.txt', 'w') as phrase_file:
        phrase_file.writelines([str(phrase) + "\n" for phrase in all_phrases])

    print("extract_key_phrases: Completed extraction")


if __name__ == "__main__":
    endpoint, key, args = parse_endpoint_key_arguments(sys.argv[1:])

    if endpoint is None or key is None or args is None:
        print("Usage: python extract_key_phrases.py --endpoint <endpoint_url> --key <api_key> <input_file>")
        sys.exit(2)
    # Deriving the output filename assumes the input filename has only one "."
    input_file = args[0]
    elements = input_file.split('.')
    output_file = elements[0] + '-keyphrases.' + elements[1]
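    # e.g. (illustrative) "catalog.csv" becomes "catalog-keyphrases.csv"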

    extract_key_phrases(endpoint, key, input_file, output_file)