-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_bad_lines.py
34 lines (25 loc) · 1.43 KB
/
find_bad_lines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# this script detects which line caused HuggingFace Transformers to crash
import pandas as pd
# Go through the file and find rows whose entries have the wrong type:
# sentences must be strings and labels must be (non-missing) floats or ints.
FILE_PATH = "your_dataset_here.csv"
SHOW_SPECIFIC_BAD_LINES = True

# columns the dataset is allowed to contain, and the subset it must contain
ALLOWED_COLUMNS = ("sentence1", "sentence2", "label")
REQUIRED_COLUMNS = ("sentence1", "label")


def _is_bad_sentence(value):
    """Return True when a sentence cell is anything other than a string."""
    return not isinstance(value, str)


def _is_bad_label(value):
    """Return True when a label cell is not a usable float or int."""
    if not isinstance(value, (float, int)):
        return True
    # NaN is a float, so the isinstance check alone would let a missing
    # label through; NaN is the only value that is not equal to itself.
    return value != value


def find_bad_rows(df):
    """Return the rows of ``df`` that contain at least one badly typed cell.

    A row is bad when "sentence1" is not a string, when "sentence2" (only
    checked if that column exists) is not a string, or when "label" is not
    a float/int or is NaN.

    Args:
        df: DataFrame holding the dataset; must have "sentence1" and "label".

    Returns:
        A DataFrame with each bad row exactly once, in its original order
        and with its original index.

    Raises:
        ValueError: if "sentence1" or "label" is missing from ``df``.
    """
    missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise ValueError(f"The file is missing required column(s): {missing}")

    checks = [("sentence1", _is_bad_sentence)]
    if "sentence2" in df.columns:
        checks.append(("sentence2", _is_bad_sentence))
    checks.append(("label", _is_bad_label))

    # Collect row positions in a set so that a row which is bad in several
    # columns is still reported only once.
    bad_positions = set()
    for column, is_bad in checks:
        flags = df[column].apply(is_bad).tolist()
        bad_positions.update(pos for pos, flag in enumerate(flags) if flag)
    return df.iloc[sorted(bad_positions)]


def main():
    """Read FILE_PATH, warn about unexpected columns, and print the bad rows."""
    # open the file
    df = pd.read_csv(FILE_PATH)
    # make sure that the only columns are "sentence1", "sentence2" (optional), and "label"
    if any(col not in ALLOWED_COLUMNS for col in df.columns):
        print(f"The file should only have columns named \"sentence1\", \"sentence2\", and \"label\". Instead, the columns are: {df.columns}")
    df_bad_lines = find_bad_rows(df)
    # print all the bad lines as a list
    # NOTE: these are DataFrame index values, not raw line numbers of the file
    print(f"All bad lines:\n{df_bad_lines.index.tolist()}")
    if SHOW_SPECIFIC_BAD_LINES:
        # display the index and the contents of each bad row
        for index, row in df_bad_lines.iterrows():
            print(index)
            print(row)


if __name__ == "__main__":
    main()