-
Notifications
You must be signed in to change notification settings - Fork 1
/
load_db.py
executable file
·59 lines (55 loc) · 2.75 KB
/
load_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Load Ecco into PG"""
from io import StringIO
from tqdm import tqdm
import psycopg2
if __name__ == "__main__":
with psycopg2.connect(database="commonplaces", user="digging_write", password="martini", host="localhost") as conn:
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS ecco")
cursor.execute(
"""create table ecco (sourceobjectid int, sourceauthor text, sourcetitle text, sourcedate smallint, sourcematchsize smallint, sourceleftcontext text, sourcematchcontext text, sourcerightcontext text, sourcephiloid text, sourcemodulename text, targetobjectid int, targetauthor text, targettitle text, targetdate smallint, targetmatchsize smallint, targetleftcontext text, targetmatchcontext text, targetrightcontext text, targetphiloid text, targetmodulename text, passageident int, passageidentcount int, authorident smallint)"""
)
with open("ecco_with_fields.tab", encoding="latin-1") as input_file:
input_file.readline() # skip first line
rows = ""
lines = 0
for pos, line in tqdm(enumerate(input_file), total=60442025):
rows += line
lines += 1
if lines == 100000:
file_obj = StringIO(rows)
cursor.copy_from(file_obj, "ecco")
rows = ""
lines = 0
if lines:
file_obj = StringIO(rows)
cursor.copy_from(file_obj, "ecco")
rows = ""
lines = 0
# Create full text search indexes
for field in [
"sourceauthor",
"targetauthor",
"sourcetitle",
"targettitle",
"sourcematchcontext",
"targetmatchcontext",
]:
print(f"Indexing {field} with trigram index...", flush=True)
cursor.execute(f"CREATE INDEX {field}_trigrams_idx ON ecco USING GIN({field} gin_trgm_ops)")
# Create btree indexes
for field in [
"sourceauthor",
"targetauthor",
"sourcedate",
"targetdate",
"sourcemodulename",
"targetmodulename",
"passageident",
]:
print(f"Indexing {field} with b-tree index...", flush=True)
cursor.execute(f"CREATE INDEX {field}_idx ON ecco USING BTREE({field})")
# We use a hash index for titles since some fields are too long for a btree index
cursor.execute("create index sourcetitle_idx on ecco using hash(sourcetitle)")
cursor.execute("create index targettitle_idx on ecco using hash(targettitle)")
conn.commit()