Skip to content

Commit

Permalink
add stackexchange data
Browse files Browse the repository at this point in the history
  • Loading branch information
Kevin Maik Jablonka committed Oct 13, 2023
1 parent 18710d5 commit 35d9cd0
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 0 deletions.
24 changes: 24 additions & 0 deletions data/tabular/mattermodeling_stackexchange/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
name: mattermodeling_stackexchange
description: |-
Questions and answers mined from mattermodeling.stackexchange.com.
targets:
- id: a
description: answer to the question
type: string
identifiers:
- id: q
type: string
description: question asked on mattermodeling.stackexchange.com
license: CC-BY-SA-4.0
links:
- url: https://mattermodeling.stackexchange.com
description: original data source
- url: https://stackoverflow.com/help/licensing
description: information about the license
num_points: 2571
templates:
- |-
{#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!}
{#User: |Question: |Inquiry: |\n!}{#q}
{#Assistant: |Answer: !}{#a}
64 changes: 64 additions & 0 deletions data/tabular/mattermodeling_stackexchange/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from datasets import load_dataset
import pandas as pd


def remove_repeated_almost_empty_lines(text):
    """Drop blank lines and collapse repeated "almost empty" lines.

    A line is "almost empty" when, after stripping surrounding whitespace,
    it contains no letter or digit, i.e. it consists only of punctuation or
    other special characters. Manual inspection of the raw text showed runs
    of such lines that carry no information.

    Rules applied line by line:

    * blank / whitespace-only lines are always removed;
    * an almost-empty line is removed when it equals (ignoring surrounding
      whitespace) the most recently kept line;
    * lines with real content are always kept, even when the same line
      appears several times in a row (e.g. repeated rows in a code listing).

    Args:
        text: The raw text, with lines separated by ``"\\n"``.

    Returns:
        The cleaned text, with the kept lines joined by ``"\\n"``. Kept lines
        retain their original leading/trailing whitespace.
    """
    kept_lines = []
    last_kept = None  # stripped form of the most recently kept line
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue
        # Only de-duplicate separator-like lines; repeated lines that carry
        # letters or digits are genuine content and must survive.
        is_almost_empty = not any(ch.isalnum() for ch in stripped)
        if is_almost_empty and stripped == last_kept:
            continue
        kept_lines.append(line)
        last_kept = stripped
    return "\n".join(kept_lines)


def _select_answer_text(answers):
    """Return the text of the answer to keep for one question, or ``None``.

    Each entry of ``answers`` is indexed as ``[text, score]``.

    * no answers: ``None`` (the question is dropped);
    * exactly one answer: that answer, whatever its score;
    * several answers: the first one whose score is not 0, or ``None`` when
      every answer has a score of 0.
    """
    # Use ``len`` rather than truthiness: presumably the container is an
    # array-like coming out of ``to_pandas()``, for which ``bool(...)`` may
    # be ambiguous -- TODO confirm the concrete type.
    if len(answers) == 0:
        return None
    if len(answers) == 1:
        return answers[0][0]
    for answer in answers:
        if answer[1] != 0:
            return answer[0]
    return None


def get_clean_df(output_path="data_clean.csv"):
    """Build the cleaned question/answer table and write it to CSV.

    Loads the ``train`` split of ``marianna13/mattermodeling-stackexchange``,
    keeps at most one answer per question (see ``_select_answer_text``),
    strips both texts, runs them through
    ``remove_repeated_almost_empty_lines`` and stores the result in two
    columns: ``q`` (question) and ``a`` (answer).

    Args:
        output_path: Destination of the CSV file. Defaults to
            ``"data_clean.csv"`` in the current working directory.

    Returns:
        The ``pandas.DataFrame`` that was written, with columns ``q`` and
        ``a``.
    """
    dataset = load_dataset("marianna13/mattermodeling-stackexchange")
    df = dataset["train"].to_pandas()

    questions_w_answer = []
    for question, answers in zip(df["question_text"], df["answers"]):
        answer_text = _select_answer_text(answers)
        if answer_text is None:
            # Unanswered question, or only zero-score answers among several.
            continue
        questions_w_answer.append(
            [
                remove_repeated_almost_empty_lines(question.strip()),
                remove_repeated_almost_empty_lines(answer_text.strip()),
            ]
        )

    df_qa = pd.DataFrame(questions_w_answer, columns=["q", "a"])
    df_qa.to_csv(output_path, index=False)
    return df_qa


# Script entry point: build and write the cleaned question/answer CSV only
# when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    get_clean_df()

0 comments on commit 35d9cd0

Please sign in to comment.