-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Kevin Maik Jablonka
committed
Oct 13, 2023
1 parent
18710d5
commit 35d9cd0
Showing
2 changed files
with
88 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
--- | ||
name: mattermodeling_stackexchange | ||
description: |- | ||
Questions and answers mined from mattermodeling.stackexchange.com. | ||
targets: | ||
- id: a | ||
description: answer to the question | ||
type: string | ||
identifiers: | ||
- id: q | ||
type: string | ||
description: question asked on mattermodeling.stackexchange.com | ||
license: CC-BY-SA-4.0 | ||
links: | ||
- url: https://mattermodeling.stackexchange.com | ||
description: original data source | ||
- url: https://stackoverflow.com/help/licensing | ||
description: information about the license | ||
num_points: 2571 | ||
templates: | ||
- |- | ||
{#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} | ||
{#User: |Question: |Inquiry: |\n!}{#q} | ||
{#Assistant: |Answer: !}{#a} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from datasets import load_dataset | ||
import pandas as pd | ||
|
||
|
||
def remove_repeated_almost_empty_lines(text):
    """Drop blank lines and collapse runs of repeated filler lines.

    Manual inspection of the raw text showed stretches of empty lines and of
    the same "almost empty" line (only punctuation, special characters or
    spaces) repeated several times in a row.

    A line is removed when either

    * it is empty after stripping whitespace, or
    * it is almost empty (contains no letter or digit) and its stripped form
      equals the previously kept line.

    Lines that carry real content are always kept, even when they are
    identical to the line before (e.g. two consecutive ``end`` statements in
    a code snippet).

    Args:
        text: The raw text, with lines separated by ``"\\n"``.

    Returns:
        The text with the offending lines removed; kept lines are joined with
        ``"\\n"`` and retain their original leading/trailing whitespace.
    """
    kept_lines = []
    previous_kept = None
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue
        # "Almost empty": nothing but punctuation / special characters.
        is_filler = not any(char.isalnum() for char in stripped)
        if is_filler and stripped == previous_kept:
            continue
        kept_lines.append(line)
        # Blank lines never update this, so filler lines separated only by
        # blank lines still count as repeated.
        previous_kept = stripped
    return "\n".join(kept_lines)
|
||
|
||
def _select_answer(answers):
    """Return the text of the answer to keep for one question, or ``None``.

    ``answers`` is an array of arrays whose first element is the answer text
    and whose second element is the score.

    * no answers: ``None`` (the question is dropped)
    * exactly one answer: that answer, whatever its score
    * several answers: the first one whose score is not 0, or ``None`` when
      every score is 0
    """
    # Explicit len() checks: the column may hold array-likes, for which plain
    # truthiness is not reliable.
    if len(answers) == 0:
        return None
    if len(answers) == 1:
        return answers[0][0]
    # NOTE(review): `!= 0` also lets negatively scored answers through --
    # confirm this is intended.
    return next((answer[0] for answer in answers if answer[1] != 0), None)


def get_clean_df():
    """Build the cleaned question/answer table and write it to disk.

    Loads the ``train`` split of ``marianna13/mattermodeling-stackexchange``,
    picks at most one answer per question (see ``_select_answer``), strips
    leading/trailing whitespace from question and answer, passes both through
    ``remove_repeated_almost_empty_lines`` and writes the result to
    ``data_clean.csv`` (no index column).

    Returns:
        The cleaned ``pandas.DataFrame`` with the two string columns ``q``
        (question) and ``a`` (answer) -- the same data that is written to
        the CSV file.
    """
    dataset = load_dataset("marianna13/mattermodeling-stackexchange")
    df = dataset["train"].to_pandas()

    questions_w_answer = []
    for question, answers in zip(df["question_text"], df["answers"]):
        answer_text = _select_answer(answers)
        if answer_text is None:
            continue
        questions_w_answer.append(
            [
                remove_repeated_almost_empty_lines(question.strip()),
                remove_repeated_almost_empty_lines(answer_text.strip()),
            ]
        )

    df_qa = pd.DataFrame(questions_w_answer, columns=["q", "a"])
    df_qa.to_csv("data_clean.csv", index=False)
    return df_qa
|
||
|
||
if __name__ == "__main__":
    # Run the download-and-clean step only when executed as a script, not
    # when the module is imported.
    get_clean_df()