-
Notifications
You must be signed in to change notification settings - Fork 0
/
csv_data_fix.py
107 lines (99 loc) · 3.95 KB
/
csv_data_fix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python3
# -*-coding: utf-8 -*-
import argparse
import csv
import os
class DataReaderFix(object):
    """
    Repair the corrupted ``za_sample_listings_incl_cat.csv`` file.

    Rows are streamed from the source file. Rows that already have the
    expected number of fields are copied unchanged; fields of malformed rows
    are accumulated in ``saved_line`` until one of the known repair rules
    applies, and the repaired row is then written to
    ``za_sample_listings_incl_cat_fixed.csv`` in the same directory.
    """

    # A well-formed row has exactly this many fields: a leading (unnamed)
    # column followed by item_id, seller_id, listing_title,
    # listing_description, listing_price, category_sk, category_l1_name_en,
    # category_l2_name_en, category_l3_name_en, listing_latitude and
    # listing_longitude.
    EXPECTED_FIELDS = 12

    # First-column values of the 13-field rows whose fields 4 and 5 have to
    # be glued back together.
    SPLIT_FIELD_ROW_IDS = ("227153", "307934", "485283")

    # First-column value of the row whose fields 4..9 have to be glued back
    # together.
    MULTI_SPLIT_ROW_ID = "299779"

    def __init__(self, data_base_path):
        """
        :param data_base_path: directory that holds the source csv file and
            receives the fixed csv file
        """
        self.data_base_path = data_base_path
        self.sample_listings_path = os.path.join(
            self.data_base_path, "za_sample_listings_incl_cat.csv")
        # Fields of malformed rows collected so far and not yet repaired.
        self.saved_line = []

    def read_scv(self, csv_filename, delimiter, read_line):
        """
        Read a csv file and pass every row to a row handler.

        The first physical line of the input (the header) is skipped and is
        not copied to the output file.

        :param csv_filename: path to csv data
        :param delimiter: delimiter for csv file (used for reading and for
            writing)
        :param read_line: name of the method of this object that processes
            one row, or a callable; it is called as ``handler(row, writer)``
        :raises OSError: if ``csv_filename`` is missing or not readable
        """
        if not os.access(csv_filename, os.R_OK):
            raise OSError(
                "Can not read %s. There is no such file or you have no "
                "permission to read it" % csv_filename)
        if isinstance(read_line, str):
            handler = getattr(self, read_line)
        else:
            handler = read_line
        fixed_path = os.path.join(
            self.data_base_path, "za_sample_listings_incl_cat_fixed.csv")
        # newline="" is what the csv module requires for files it reads or
        # writes; the with-statement guarantees that the output is flushed
        # and closed even when a row handler raises.
        with open(csv_filename, "r", newline="") as fin, \
                open(fixed_path, "w", newline="") as fout:
            reader = csv.reader(fin, delimiter=delimiter)
            writer = csv.writer(fout, delimiter=delimiter)
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(fin, None)
            for line in reader:
                handler(line, writer)

    def read_samples_line_fix(self, line, writer):
        """
        Process one row of the csv file.

        A row with the expected number of fields is written unchanged. Any
        other row is appended to ``saved_line``; as soon as the accumulated
        fields match a known repair rule, the repaired row is written and the
        buffer is cleared.

        :param line: list of fields of one csv row
        :param writer: object with a ``writerow`` method that receives the
            good and the repaired rows
        :raises ValueError: if at least ``EXPECTED_FIELDS`` fields were
            accumulated and no repair rule matches them
        :return: None
        """
        if len(line) == self.EXPECTED_FIELDS:
            writer.writerow(line)
            return
        self.saved_line.extend(line)
        repaired = self._repair_saved_line()
        if repaired is not None:
            writer.writerow(repaired)
            self.saved_line = []

    def _repair_saved_line(self):
        """
        Apply the first matching repair rule to the accumulated fields.

        :raises ValueError: if enough fields were collected but no rule
            matches; the buffer is cleared before raising
        :return: the repaired row, or None when more fields are needed
        """
        fields = self.saved_line
        if not fields:
            return None
        count = len(fields)
        one_extra = self.EXPECTED_FIELDS + 1
        if count == one_extra and "" in fields:
            # One field too many and at least one of them empty: drop every
            # empty field.
            return [field for field in fields if field]
        if count == one_extra and fields[0] in self.SPLIT_FIELD_ROW_IDS:
            # Field 4 was split in two: join fields 4 and 5.
            return fields[:4] + [fields[4] + fields[5]] + fields[6:]
        if (count >= self.EXPECTED_FIELDS and
                fields[0] == self.MULTI_SPLIT_ROW_ID):
            # Field 4 was split in six: join fields 4..9.
            return fields[:4] + ["".join(fields[4:10])] + fields[10:]
        if count >= self.EXPECTED_FIELDS:
            # Unknown kind of corruption. Fail loudly instead of silently
            # dropping or mangling data, and reset the buffer so the object
            # stays usable if the caller decides to go on.
            self.saved_line = []
            raise ValueError(
                "Do not know how to repair the malformed row "
                "(%d fields): %r" % (count, fields))
        return None

    def read_samples(self):
        """
        Fix the sample listings file found in ``data_base_path``.
        """
        self.read_scv(
            self.sample_listings_path, ",", "read_samples_line_fix")
if __name__ == "__main__":
    # Without -p/--base-path the data is looked up next to this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-p", "--base-path",
        type=str,
        default=script_dir,
        help="Path to directory with scv data")
    options = cli.parse_args()

    # Build the fixer for the chosen directory and repair the listings file.
    DataReaderFix(options.base_path).read_samples()