#!/usr/bin/python3
import sys, os, re, shutil
# Must import storage before utils
import update_settings as settings
from storage import storage
storage.uri = settings.MONGODB_URI
storage.db_name = settings.MONGODB_DB_NAME
storage.connect()
from time import time
from ftplib import FTP
from defs import CRAWLTYPE_LINE as LINE, CRAWLTYPE_FILE as FILE
from utils import (download, entries_by_pdbid, get_present_entries,
                   get_missing_entries, read_http, parse_regex, valid_path)
# Each entry has a pdbid and a databank_name; together these make it unique.
# An entry may also have a file path (present entries) or a comment (annotated entries).
# The mtime field records when the file or comment was added, in seconds since
# January 1st, 1970, 00:00:00 (Unix time).
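#
# For illustration, a present entry in the 'entries' collection looks roughly
# like this (all values below are made up, not taken from a real databank):
#
#   {'databank_name': 'dssp',
#    'pdbid': '1crn',
#    'filepath': '/data/dssp/1crn.dssp',
#    'mtime': 1514764800.0}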
# This function is used for file crawling:
def get_pathnames (path):
    if path.startswith ("ftp://"):
        if not path.endswith ('/'):
            path = path + '/'

        host = path.split('/')[2]
        _dir = path [7 + len(host):]

        ftp = FTP (host)
        ftp.login ()

        h = []
        for f in ftp.nlst(_dir):
            h.append (path + f)
        ftp.quit()

        return h

    elif os.path.isdir (path):
        h = []
        for root, dirs, files in os.walk (path):
            for f in files:
                if 'obsolete' in f or os.path.splitext (f)[1] in ['.gif', '.html']:
                    continue  # skip images and web pages to save time

                f = os.path.join (root, f)
                h.append (f)
        return h
    else:
        raise Exception ("invalid path to get files from: " + path)
# This function is used for line crawling:
def get_lines (path):
    if path.startswith ('http://') or path.startswith ('ftp://'):
        return read_http (path).split ('\n')
    elif os.path.isfile (path):
        with open (path, 'r') as f:
            return f.readlines ()
    else:
        raise Exception ("invalid path to get lines from: " + path)
def remove_changed (databank, lines=[]):
    pattern = parse_regex(databank['regex'])

    line_matches = {}
    if databank ['crawltype'] == LINE:
        for line in lines:
            m = pattern.search (line)
            if m:
                line_matches [m.group (1)] = line

    # Remove entries where the file's mtime has changed or where the
    # actual file/line was removed or doesn't match the pattern anymore:
    for entry in get_present_entries (databank['name']):
        path = entry ['filepath']

        if databank ['crawltype'] == FILE and \
                (not valid_path (databank['name'], path) or
                 os.path.getmtime (path) != entry['mtime']):
            storage.remove ('entries', {'databank_name': databank['name'], 'pdbid': entry['pdbid']})
        elif databank ['crawltype'] == LINE and \
                (not os.path.isfile (path) or
                 os.path.getmtime (path) != entry['mtime'] or
                 entry ['pdbid'] not in line_matches):
            storage.remove ('entries', {'databank_name': databank['name'], 'pdbid': entry['pdbid']})
def crawl_files (databank, pathnames):
    present_entries_bypdbid = entries_by_pdbid (get_present_entries (databank ['name']))
    record_pdbids = entries_by_pdbid (storage.find ('entries', {'databank_name': databank ['name']}, {'pdbid': 1}))

    pattern = parse_regex (databank['regex'])
    for f in pathnames:
        # Only use files that match the databank's pattern.
        m = pattern.search(f)
        if not m:
            continue

        # For disk files take their mtimes, for urls take current time.
        mtime = time ()
        if os.path.isfile (f):
            mtime = os.path.getmtime (f)

        entry = {
            'databank_name': databank['name'],
            'pdbid': m.group(1).lower(),
            'filepath': f,
            'mtime': mtime
        }

        if entry ['pdbid'] in present_entries_bypdbid:
            continue

        if entry ['pdbid'] in record_pdbids:
            storage.update ('entries', {'databank_name': databank ['name'], 'pdbid': entry ['pdbid']}, entry)
        else:
            storage.insert ('entries', entry)
def crawl_lines (databank, filepath, lines):
    present_entries_bypdbid = entries_by_pdbid (get_present_entries (databank['name']))
    record_pdbids = entries_by_pdbid (storage.find ('entries', {'databank_name': databank['name']}, {'pdbid': 1}))

    pattern = parse_regex (databank['regex'])

    # If it's a disk file take its mtime, for urls take current time.
    mtime = time ()
    if os.path.isfile (filepath):
        mtime = os.path.getmtime (filepath)

    for line in lines:
        # Only use lines that match the databank's pattern.
        m = pattern.search (line)
        if not m:
            continue

        entry = {
            'databank_name': databank['name'],
            'pdbid': m.group(1).lower(),
            'filepath': filepath,
            'mtime': mtime
        }

        if entry['pdbid'] in present_entries_bypdbid:
            continue

        if entry['pdbid'] in record_pdbids:
            storage.update ('entries', {'databank_name': databank['name'], 'pdbid': entry['pdbid']}, entry)
        else:
            storage.insert ('entries', entry)
if len(sys.argv) != 3:
    print('Usage: %s [databank name] [source]' % sys.argv[0])
    sys.exit(1)

databank_name = sys.argv [1]
source = sys.argv [2]

databank = storage.find_one ('databanks', {'name': databank_name, 'crawltype': {'$in': [LINE, FILE]}})
if not databank:
    raise Exception ('databank not found or unknown crawl type: ' + databank_name)
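# The databank record is expected to provide at least 'name', 'crawltype' and
# 'regex' fields; the regex must capture the pdbid in group 1. A made-up
# example (not taken from a real configuration):
#
#   {'name': 'dssp', 'crawltype': FILE, 'regex': '([0-9a-zA-Z]{4})\\.dssp$'}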
# On urls, we can only use the line crawler for now.
if source.startswith ('http://') or source.startswith ('ftp://') or os.path.isfile (source):
    lines = get_lines (source)
    remove_changed (databank, lines)
    crawl_lines (databank, source, lines)
elif os.path.isdir (source):
    files = get_pathnames (source)
    remove_changed (databank)
    crawl_files (databank, files)
else:
    raise Exception ('invalid source, expected a url, file or directory: ' + source)
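
# Example invocation (the databank name and directory are hypothetical; a real
# run needs a matching record in the 'databanks' collection):
#
#   ./crawl.py dssp /data/dssp/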