-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.sh
executable file
·131 lines (110 loc) · 4.55 KB
/
index.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env bash
# Setup variables.
# Positional arguments: $1 = path of the (read-only) collection,
# $2 = index name, $3 = collection format handed to the parser.
COLLECTION_PATH=$1
INDEX=$2
COLLECTION_FORMAT=$3
# Number of parsed files accumulated before a bulk request is issued.
BULK_SIZE=1000

# Portions of this code copied from https://github.com/osirrc/indri-docker.

#######################################
# Recursively copy a collection directory to another location.
# Arguments: $1 - source directory
#            $2 - destination path
# Outputs:   a progress line on stdout
# Returns:   the exit status of cp
#######################################
copy_collection() {
  local src=$1
  local dst=$2
  echo "copying files of directory ${src} into ${dst}"
  # Quoted so that paths containing whitespace or glob characters are passed
  # as single arguments; "--" keeps a leading "-" from being read as an option.
  cp -r -- "${src}" "${dst}"
}

# The mounted collection folder is read-only, we need a writable folder.
COLLECTION_PATH_WRITABLE="${COLLECTION_PATH}-WRITABLE"
copy_collection "${COLLECTION_PATH}" "${COLLECTION_PATH_WRITABLE}"
echo "done!"
#######################################
# Prepare a writable copy of the collection for the "robust04" index:
# delete the parts that must not be indexed, then decompress everything.
# Arguments: $1 - root of the writable collection copy
# Outputs:   progress lines on stdout
#######################################
prepare_robust04() {
  local root=$1
  # Abort instead of letting an empty root turn the paths below into
  # absolute paths such as /disk4/cr.
  : "${root:?collection root must not be empty}"

  # Remove the unwanted parts of disk45 (as per ROBUST04 guidelines)
  rm -r -- \
    "${root}/disk4/cr" \
    "${root}/disk4/dtds" \
    "${root}/disk5/dtds" \
    "${root}/disk4/fr94/aux/"
  rm -- \
    "${root}/disk4/ft/readfrcg.z" \
    "${root}/disk4/ft/readmeft.z" \
    "${root}/disk4/fr94/readchg.z" \
    "${root}/disk4/fr94/readmefr.z" \
    "${root}/disk5/latimes/readmela.txt" \
    "${root}/disk5/latimes/readchg.txt"

  # Robust04 has a folder with `NAME.0z`, `NAME.1z` and `NAME.2z` files, simply using gunzip
  # is not an option as files are being overwritten (same name, different suffix);
  # hacked solution: add ".z" to every such file so that each one decompresses
  # to a distinct name (NAME.0z.z -> NAME.0z, ...).
  # A single tree walk handles all three suffixes; the rename goes through
  # sh -c because substituting {} inside a larger word is a find extension.
  find "${root}" -maxdepth 100 -type f \
    \( -name "*.0z" -o -name "*.1z" -o -name "*.2z" \) \
    -exec sh -c 'for f; do mv -- "$f" "$f.z"; done' _ {} +

  # Decompress.
  echo "robust04 ... decompressing"
  gunzip --suffix=".z" -r -- "${root}"
  echo "done!"
}

if [[ "${INDEX}" == "robust04" ]]; then
  # Use a much smaller bulk size for this collection.
  BULK_SIZE=10
  prepare_robust04 "${COLLECTION_PATH_WRITABLE}"
fi
#######################################
# Prepare a writable copy of the collection for the "core17" index:
# delete the non-document parts, gunzip the *.tgz archives below the root and
# unpack every archive found under data/ into decompressed/.
# Arguments: $1 - root of the writable collection copy
# Outputs:   progress lines on stdout
#######################################
prepare_core17() {
  local root=$1
  # Abort instead of letting an empty root turn the paths below into
  # absolute paths such as /docs.
  : "${root:?collection root must not be empty}"

  rm -r -- "${root}/docs" "${root}/dtd" "${root}/tools"
  rm -- "${root}/index.html"

  # find ${COLLECTION_PATH_WRITABLE} -name "*.tgz" -type f -exec mv '{}' '{}'.z \;
  echo "core17 ... decompressing"
  gunzip --suffix=".tgz" -r -- "${root}"

  # Every regular file left under data/ is treated as a tar archive and
  # extracted into one common directory; data/ itself is dropped afterwards
  # so that only the extracted files remain to be indexed.
  mkdir -p -- "${root}/decompressed"
  find "${root}/data" -maxdepth 100 -type f \
    -exec tar -xf '{}' -C "${root}/decompressed" \;
  rm -rf -- "${root}/data"
  echo "done!"
}

if [[ "${INDEX}" == "core17" ]]; then
  prepare_core17 "${COLLECTION_PATH_WRITABLE}"
fi
#######################################
# Prepare a writable copy of the collection for the "core18" index:
# delete the non-document parts and write every line of the collection dump
# to its own file next to it (xaa, xab, ...), then delete the dump.
# Arguments: $1 - root of the writable collection copy
#######################################
prepare_core18() {
  local root=$1
  # Abort instead of letting an empty root turn the paths below into
  # absolute paths such as /scripts.
  : "${root:?collection root must not be empty}"
  local data_dir="${root}/data"
  local dump="${data_dir}/TREC_Washington_Post_collection.v2.jl"

  rm -- "${root}/MD5SUMS" "${root}/README.md"
  rm -r -- "${root}/scripts"

  # An explicit output prefix puts the pieces into data/ without changing
  # directory, so a missing data/ can no longer make split run (and litter)
  # in whatever the current directory happens to be.
  split -l 1 -- "${dump}" "${data_dir}/x"
  rm -- "${dump}"
}

if [[ "${INDEX}" == "core18" ]]; then
  prepare_core18 "${COLLECTION_PATH_WRITABLE}"
  # This branch has always left the working directory at /; the later steps
  # use ./-relative paths, so that post-condition is kept.
  cd / || exit 1
fi
# Wait for Elasticsearch.
./eswait.sh

# Create the index.
# The URLs are quoted: they contain ${INDEX} and a "?", which the shell would
# otherwise subject to word splitting and pathname expansion.
curl -s -H "Content-Type: application/json" -X PUT \
  "localhost:9200/${INDEX}?wait_for_active_shards=1" \
  -d '{"settings": {"number_of_shards": 4}}'
echo

# Set the refresh interval to 60s through the cluster-wide _settings endpoint.
curl -s -H 'Content-Type: application/json' -X PUT \
  "localhost:9200/_settings" \
  -d '{ "index": { "refresh_interval": "60s"}}'
echo
#######################################
# Send the accumulated bulk payload to Elasticsearch and report the outcome.
# Globals:   INDEX (read)
# Files:     reads ./requests, writes the response body to ./resp,
#            removes ./requests afterwards
# Outputs:   "[√]" when the payload was indexed; otherwise "[X]" followed by
#            the response body
# Returns:   0 when the payload was indexed, 1 otherwise
#######################################
do_request() {
  local status
  local rc=0

  # We have a parsed file, now try to index it.
  status=$(curl -s -w "%{http_code}" -o resp \
    -H "Content-Type: application/x-ndjson" \
    -X POST "localhost:9200/${INDEX}/_bulk" --data-binary "@requests")

  # The bulk endpoint answers 200 even when individual actions were rejected;
  # such partial failures are only visible as "errors": true in the body, so
  # both the status code and that flag have to be checked.
  if [[ "${status}" != 200 ]] \
    || grep -Eq '"errors"[[:space:]]*:[[:space:]]*true' resp 2>/dev/null; then
    # Can't index the file, so what's the error?
    printf "[X]\n"
    echo "###### RESPONSE: ######"
    cat resp; echo
    rc=1
  else
    # Okay, great, we indexed the file.
    printf "[√]\n"
  fi

  # Remove the requests file (no complaint if it is already gone).
  rm -f -- requests
  return "${rc}"
}
#######################################
# Parse every regular file below a directory and bulk-index the parser output.
# Globals:   INDEX, COLLECTION_FORMAT, BULK_SIZE (read)
# Arguments: $1 - directory to walk
# Files:     appends parser output to ./requests; do_request consumes it
# Outputs:   one progress line per file on stdout
#######################################
index_collection() {
  local root=$1
  local pending=0
  local filename

  # Iterate over each file in the collection path, parsing each
  # one as it sees it, then bulk indexing the file.
  # File names arrive NUL-delimited on fd 3, so names containing whitespace or
  # glob characters stay intact and stdin is left alone for the loop body.
  while IFS= read -r -d '' filename <&3; do
    echo "parsing ${filename} (${pending}/${BULK_SIZE} for bulk index)"

    # Try to parse the file.
    # ${COLLECTION_FORMAT:+...} expands to no argument at all when the format
    # is empty, exactly as the former unquoted expansion did ("trecweb" then
    # moves up one position).
    # NOTE(review): presumably that makes trecweb the fallback format, and the
    # parser is assumed to exit non-zero on failure - confirm against
    # ielab_cparser. (Testing for the existence of ./requests cannot detect a
    # failure: the >> redirection always creates the file.)
    if ! ./ielab_cparser "${INDEX}" ${COLLECTION_FORMAT:+"${COLLECTION_FORMAT}"} trecweb \
      < "${filename}" >> requests; then
      # We were unable to parse the file...
      printf "[X] - couldn't parse!\n"
    fi

    # Count first, then compare: every batch holds exactly BULK_SIZE files
    # (the first batch used to hold one more).
    pending=$((pending + 1))
    if (( pending >= BULK_SIZE )); then
      # Never post an empty payload.
      if [[ -s requests ]]; then
        do_request
      fi
      pending=0
    fi
  done 3< <(find "${root}" -type f -print0)

  # Flush what is left; -s is true only for an existing, non-empty file.
  if [[ -s requests ]]; then
    echo "issuing remaining documents for bulk indexing"
    do_request
  fi
}

index_collection "${COLLECTION_PATH_WRITABLE}"

# Remove the resp file.
rm -f -- resp

# Make the indexed documents searchable, then print the cluster health and the
# document count. URLs are quoted because of ${INDEX} and the "?" glob
# character.
curl -s -o /dev/null -X POST "localhost:9200/${INDEX}/_refresh?pretty"
curl -s -X GET "localhost:9200/_cluster/health?pretty"
curl -s -X GET "localhost:9200/${INDEX}/_count?pretty" | grep count | sed 's/["| |,]//g'