forked from yifan/c4-qcri
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathc4-qcri.sh
executable file
·176 lines (139 loc) · 4.89 KB
/
c4-qcri.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/bin/bash -e
# c4-qcri.sh
# QCRI's script to download and clean Arabic data from CommonCrawl. It
# - downloads a given CommonCrawl dump (WET files)
# - parses them and extracts only pages with 'ara' among the identified languages
# - writes its output in gzipped jsonl format
#
# by Yifan Zhang (yzhang@hbku.edu.qa)
# Copyright (C) 2024, Qatar Computing Research Institute
# Directory this script was invoked from (as given in $0). It is exported so
# that the exported functions, which run in child shells, can locate
# split_wet_file.py and corrupted.lst next to the script.
# Assignment and export are separate statements so that a failing `dirname`
# is not masked by the exit status of `export`.
SCRIPT_DIR=$(dirname "$0")
export SCRIPT_DIR
# Signal handler: report the interruption, stop the processes started by this
# script, and leave with a failure status.
cleanup() {
  echo "Interrupt signal received. Cleaning up..."
  # $$ is the PID of this script; -P limits pkill to processes whose parent
  # is that PID, i.e. the direct children of the script.
  pkill -P "$$"
  exit 1
}

# Run the handler when the script receives SIGINT (Ctrl+C).
trap cleanup SIGINT
#######################################
# Download a URL to a file, retrying until wget reports success.
# Arguments:
#   $1 - URL to fetch
#   $2 - path of the file to write
# Returns:
#   0 once a download attempt succeeds. There is no retry limit, so the
#   function does not return while every attempt keeps failing.
#######################################
function wget_until_success {
  # local: keep the caller's (and the environment's) URL/SAVETO untouched
  local url=$1
  local saveto=$2

  # -T 120: give up on a stalled connection after 120 seconds
  # -q: no progress output, -O: write the body to $saveto
  until wget -T 120 -q -O "$saveto" "$url"; do
    # back off for a minute before the next attempt
    sleep 60
  done
}
export -f wget_until_success
#######################################
# Turn a WET path into a flat file name.
# Everything up to and including the first "segments/" is dropped and every
# remaining "/" is replaced by "-". A path without "segments/" only gets the
# slash replacement.
# Arguments:
#   $1 - WET path
# Outputs:
#   the flattened name on stdout, followed by a newline
#######################################
function extract_unique_name {
  local wetpath=$1
  local uniqname=${wetpath#*segments/}
  # printf with a quoted argument prints the name verbatim: no word
  # splitting, no glob expansion, and no risk of it being read as an option
  printf '%s\n' "${uniqname//\//-}"
}
export -f extract_unique_name
#######################################
# Download one WET file, verify it, and run split_wet_file.py on it.
# Globals:
#   SCRIPT_DIR (read) - directory holding split_wet_file.py and corrupted.lst
# Arguments:
#   $1 - WET path, appended to https://data.commoncrawl.org/
#   $2 - output directory; files are placed in
#        <outdir>/<first 22 characters of the WET file name>/
# Outputs:
#   progress and failure messages on stdout
# Returns:
#   1 when the downloaded file is missing or empty. 0 otherwise, including
#   when the archive is corrupted or the parser fails; in those cases the
#   output file is absent (or empty for paths listed in corrupted.lst).
#######################################
function download_and_parse {
  local wetpath=$1
  local outdir=$2
  local filename subdir uniqname downloaded gzoutput

  filename=${wetpath##*/}
  subdir="${outdir}/${filename:0:22}"
  uniqname=$(extract_unique_name "$wetpath")
  downloaded="${subdir}/${uniqname}"
  # <name>.gz -> <name>.pages.jsonl.gz
  # presumably this is the path split_wet_file.py writes to; confirm there
  gzoutput="${downloaded%gz}pages.jsonl.gz"
  mkdir -p -- "$subdir"

  # if we have downloaded wet file, we probably didn't finish
  # previous job, it is better to remove both output file and
  # redo this particular file
  if [[ -s "$downloaded" ]]; then
    rm -f -- "$gzoutput" "$downloaded"
  fi

  # Skip the file when a non-empty output already exists.
  # NOTE(review): empty outputs created by `touch` below do not satisfy -s,
  # so those files are downloaded and checked again on every run, and the
  # final clean-up keeps their download. Confirm whether that is intended.
  if [[ ! -s "$gzoutput" ]]; then
    if [[ ! -s "$downloaded" ]]; then
      wget_until_success "https://data.commoncrawl.org/$wetpath" "$downloaded"
    fi

    if [[ ! -s "$downloaded" ]]; then
      echo "Downloading failed"
      return 1
    fi

    # check the integrity of downloaded file before processing
    if ! gzip -t "$downloaded"; then
      echo "file may be corrupted"
      rm -f -- "$downloaded"
      # -F: the path is matched literally (its dots are not regex
      # wildcards); a missing list simply means nothing is listed
      if [[ -f "$SCRIPT_DIR/corrupted.lst" ]] \
        && grep -qF -- "$wetpath" "$SCRIPT_DIR/corrupted.lst"; then
        echo "found file in corrupted.lst, will skip and create empty output"
        touch -- "$gzoutput"
      fi
    else
      echo "$gzoutput"
      if ! python3 "$SCRIPT_DIR/split_wet_file.py" "$downloaded"; then
        ls -lh -- "$downloaded"
        echo "Failed on $downloaded"
        # -f: the parser may have failed before creating any output
        rm -f -- "$gzoutput"
      elif [[ ! -e "$gzoutput" ]]; then
        # parser succeeded without producing a file: leave an empty marker
        touch -- "$gzoutput"
      fi
    fi
  fi

  # remove downloaded wet file to save space, only keep output
  if [[ -s "$gzoutput" ]]; then
    rm -f -- "$downloaded"
  fi
}
export -f download_and_parse
# ---------------------------------------------------------------------------
# Main script
# Usage: c4-qcri.sh CC_VERSION [NJOBS]
#   CC_VERSION  crawl identifier substituted into
#               crawl-data/CC-MAIN-<CC_VERSION>/wet.paths.gz; also the name
#               of the local output directory
#   NJOBS       number of concurrent jobs. When omitted, xargs runs 32 jobs
#               and GNU parallel runs one job per CPU.
# ---------------------------------------------------------------------------

# Stop on the first unhandled error even when started as `bash c4-qcri.sh`,
# where the -e on the shebang line is not applied.
set -e

if [[ -z "${1-}" ]]; then
  echo "Usage: $0 CC_VERSION [NJOBS]" >&2
  exit 1
fi

CC_VERSION=$1
NJOBS=${2-32}
DOWNLOAD_HOST="https://data.commoncrawl.org"
WET_PATH_URL="${DOWNLOAD_HOST}/crawl-data/CC-MAIN-CC-VERSION/wet.paths.gz"
PATHS_GZ="wet.paths/${CC_VERSION}.wet.paths.gz"
PATHS_LST="${CC_VERSION}/paths.lst"

date '+%Y-%m-%d %H:%M:%S'

# Fetch the list of WET paths for this crawl unless a copy is already cached.
mkdir -p wet.paths
if [[ ! -s "$PATHS_GZ" ]]; then
  if ! wget -q -O "$PATHS_GZ" "${WET_PATH_URL/CC-VERSION/${CC_VERSION}}"; then
    echo "Failed to download ${WET_PATH_URL/CC-VERSION/${CC_VERSION}}" >&2
    # do not leave an empty or partial list behind
    rm -f -- "$PATHS_GZ"
    exit 1
  fi
fi

mkdir -p -- "$CC_VERSION"
if [[ ! -s "$PATHS_LST" ]]; then
  # `gzip -dc` behaves the same on Linux and macOS, unlike zcat.
  # Decompress to a temporary name so that a failed run cannot leave a
  # truncated paths.lst that later runs would trust.
  gzip -dc -- "$PATHS_GZ" > "${PATHS_LST}.tmp"
  mv -- "${PATHS_LST}.tmp" "$PATHS_LST"
fi

# $(( )) strips the padding some wc implementations add around the number
EXPECTED_NUM_FILES=$(($(wc -l < "$PATHS_LST")))

# verify that we don't have filename collision
NUM_NAMES=$(
  while IFS= read -r FILENAME; do
    extract_unique_name "$FILENAME"
  done < "$PATHS_LST" | sort -u | wc -l
)
if ((EXPECTED_NUM_FILES != NUM_NAMES)); then
  echo "Failed because of naming collision before processing files, this could lead to"
  echo "jobs overwrite others' output"
  exit 1
fi

# Prefer GNU parallel (retries, job log); fall back to xargs otherwise.
if command -v parallel > /dev/null 2>&1; then
  # Honor an explicit NJOBS; otherwise keep the one-job-per-CPU default.
  if [[ -n "${2-}" ]]; then
    PARALLEL_JOBS=$2
  else
    PARALLEL_JOBS=$(nproc 2> /dev/null || getconf _NPROCESSORS_ONLN)
  fi
  parallel --retries 10 --halt now,fail=1 --joblog "$CC_VERSION/jobs.log" \
    -j "$PARALLEL_JOBS" -a "$PATHS_LST" download_and_parse {} "${CC_VERSION}"
else
  xargs -I '{}' -P "$NJOBS" bash -c 'download_and_parse "$@"' _ {} "${CC_VERSION}" \
    < "$PATHS_LST"
fi

# check if all download was okay
# -mindepth 2 skips the merged per-directory files written directly under
# $CC_VERSION; it must precede the tests to avoid a GNU find warning.
ACTUAL_NUM_FILES=$(($(find "$CC_VERSION" -mindepth 2 -name '*.pages.jsonl.gz' | wc -l)))
if ((ACTUAL_NUM_FILES < EXPECTED_NUM_FILES)); then
  echo "Expecting ${EXPECTED_NUM_FILES} files only got ${ACTUAL_NUM_FILES}"
  echo "Stopped at  $(date '+%Y-%m-%d %H:%M:%S')"
else
  # Concatenate the outputs of each CC-MAIN-* directory into one file placed
  # next to that directory (concatenated gzip members form a valid gzip file).
  find "$CC_VERSION" -name "CC-MAIN-*" -type d | while IFS= read -r CC_MAIN_DIR; do
    find "$CC_MAIN_DIR" -name '*.pages.jsonl.gz' -exec cat {} + \
      > "${CC_MAIN_DIR}.warc.wet.pages.jsonl.gz"
  done
  echo "Generated $ACTUAL_NUM_FILES Expected $EXPECTED_NUM_FILES"
  echo "Successfully finished at  $(date '+%Y-%m-%d %H:%M:%S')"
fi