-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLexicalGeography.py
139 lines (106 loc) · 4.76 KB
/
LexicalGeography.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# LexicalGeography is a tool for extracting location information from text.
# It is heavily inspired by the geotext package. However, it identifies county
# and state names as well.
# This module is in development.
import pandas as pd
from collections import namedtuple
import nltk
from nltk.collocations import *
import time
#-------------------------------------------------------------------------------
def _name_lookup(frame, name_col):
    """Return ``{lower-cased name: {'geonameid': id}}`` built from *frame*.

    Rows whose name is missing are dropped. When several rows share the same
    lower-cased name, the last one in file order is kept:
    ``DataFrame.to_dict('index')`` raises ``ValueError`` on a non-unique
    index, and place names repeat across countries and regions.
    """
    table = frame[[name_col, 'geonameid']].dropna(subset=[name_col]).copy()
    table[name_col] = table[name_col].str.lower()
    table = table.drop_duplicates(subset=name_col, keep='last')
    return table.set_index(name_col).to_dict('index')


def gazetteer_maker():
    """Load the location tables and return them as a ``gazetteer`` namedtuple.

    Four tab-separated files are read from ``./01_location_data/`` (relative
    to the current working directory):

    * ``countryInfo.txt``      -> ``countries`` (first 50 lines are skipped;
      the next line is used as the header and must provide the ``Country``
      and ``geonameid`` columns)
    * ``admin1CodesASCII.txt`` -> ``states``
    * ``admin2Codes.txt``      -> ``counties``
    * ``cities1000.txt``       -> ``cities``

    Returns:
        A namedtuple with fields ``countries``, ``states``, ``counties`` and
        ``cities``. Each field is a dict mapping a lower-cased place name to
        ``{'geonameid': <id>}``. If a name occurs more than once in a file,
        the last occurrence wins.

    Raises:
        Whatever ``pandas.read_table`` raises when a file is missing or
        cannot be parsed.
    """
    data_dir = './01_location_data/'
    # admin1 and admin2 files share the same headerless four-column layout
    admin_columns = ['code', 'name', 'ASCII name', 'geonameid']
    city_columns = ['geonameid', 'name', 'asciiname', 'alternatenames',
                    'latitude', 'longitude', 'feature class', 'feature code',
                    'country code', 'cc2', 'admin1code', 'admin2 code',
                    'admin3 code', 'admin4 code', 'population', 'elevation',
                    'dem', 'timezone', 'modification date']

    # countries
    country_info = pd.read_table(data_dir + 'countryInfo.txt', skiprows=50)
    countries = _name_lookup(country_info, 'Country')
    # states
    state_info = pd.read_table(data_dir + 'admin1CodesASCII.txt',
                               header=None, names=admin_columns)
    states = _name_lookup(state_info, 'name')
    # counties
    county_info = pd.read_table(data_dir + 'admin2Codes.txt',
                                header=None, names=admin_columns)
    counties = _name_lookup(county_info, 'name')
    # cities
    city_info = pd.read_table(data_dir + 'cities1000.txt',
                              header=None, names=city_columns)
    cities = _name_lookup(city_info, 'name')

    gazetteer = namedtuple('gazetteer', 'countries states counties cities')
    return gazetteer(countries, states, counties, cities)
#-------------------------------------------------------------------------------
# class lexigeo
class lexigeo(object):
    """Find gazetteer place names mentioned in a piece of text.

    The text is tokenized with NLTK's ``TweetTokenizer``; every three-token
    run, two-token run and single token is then looked up (lower-cased) in
    each table of the gazetteer.

    Attributes:
        gaz: the gazetteer object passed in; it must expose ``countries``,
            ``states``, ``counties`` and ``cities`` containers that support
            ``in`` on lower-cased names.
        countries, states, counties, cities: lists of the matching phrases,
            spelled as they appear in the text. Within each list, trigram
            matches come first, then bigram matches, then single tokens.
    """

    def __init__(self, text, gaz):
        # keep a handle on the gazetteer used for the lookups
        self.gaz = gaz

        # break the text into tokens
        words = nltk.tokenize.TweetTokenizer().tokenize(text)

        # candidate phrases: trigrams, then bigrams, then the bare tokens,
        # with multi-token candidates joined by single spaces
        three_word = [' '.join(gram) for gram in nltk.trigrams(words)]
        two_word = [' '.join(gram) for gram in nltk.bigrams(words)]
        candidates = three_word + two_word + words

        # pair every candidate with its lower-cased lookup key once
        keyed = [(phrase, phrase.lower()) for phrase in candidates]

        def found_in(table):
            # original spelling of every candidate whose key is in `table`
            return [phrase for phrase, key in keyed if key in table]

        self.countries = found_in(self.gaz.countries)
        self.states = found_in(self.gaz.states)
        self.counties = found_in(self.gaz.counties)
        self.cities = found_in(self.gaz.cities)
#-------------------------------------------------------------------------------
# Testing
def test_gazetteer_maker():
    """Build the gazetteer and print its ``states`` table for inspection."""
    print(gazetteer_maker().states)
def test_lexigeo():
    """Run an end-to-end demo: build the gazetteer, tag a sample sentence,
    and print the matches together with the build and lookup timings."""
    rule = '------------------------------------------------------------------'

    # banner
    print(rule)
    print('\nTesting LexicalGeography\n')

    # build the gazetteer and report the build time
    print(rule)
    print('\nBuilding gazetteer, this could take some time...\n')
    build_start = time.time()
    gaz = gazetteer_maker()
    build_seconds = time.time() - build_start
    print('\nGazetteer build time: ', build_seconds, ' seconds\n')

    # sample sentence used for the lookup
    text = "I live in Knoxville, Tennessee, but I am visiting Paris, France."
    print('Testing phrase: \"', text, '\"\n')

    # run the extraction
    lookup_start = time.time()
    lg = lexigeo(text, gaz)

    # show what was found in each table
    print(rule)
    print('Results')
    for label, matches in (('Countries: ', lg.countries),
                           ('States (Admin 1): ', lg.states),
                           ('Counties (Admin 2): ', lg.counties)):
        print(label, matches)
    print('Cities: ', lg.cities, '\n')

    # the lookup time is measured after the results have been printed
    lookup_seconds = time.time() - lookup_start
    print('Lookup time: ', lookup_seconds, '\n')
if __name__ == "__main__":
    # Script entry point: run the end-to-end demo. To inspect the raw tables
    # instead, call test_gazetteer_maker() or gazetteer_maker() here.
    test_lexigeo()