# Reconstructed from a whitespace-collapsed git diff that added two new files
# under data/tabular/mattermodeling_stackexchange/: meta.yaml and transform.py.
# The Python code below is transform.py. The meta.yaml payload from the same
# diff is preserved verbatim here for reference:
#
#   ---
#   name: mattermodeling_stackexchange
#   description: |-
#     Questions and answers mined from mattermodeling.stackexchange.com.
#   targets:
#     - id: a
#       description: answer to the question
#       type: string
#   identifiers:
#     - id: q
#       type: string
#       description: question asked on mattermodeling.stackexchange.com
#   license: BSD-3-Clause
#   links:
#     - url: mattermodeling.stackexchange.com
#       description: original data source
#     - url: https://stackoverflow.com/help/licensing
#       description: information about the license
#   num_points: 2571
#   templates:
#     - |-
#       {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!}
#       {#User: |Question: |Inquiry: |\n!}{#q}
#       {#Assistant: |Answer: !}{#a}
#   (no newline at end of file)

import pandas as pd


def _is_almost_empty(stripped_line: str) -> bool:
    """Return True when the line holds no letter or digit at all."""
    return not any(char.isalnum() for char in stripped_line)


def remove_repeated_almost_empty_lines(text: str) -> str:
    """Drop blank lines and consecutive repeats of almost-empty lines.

    A line is "almost empty" when, after stripping surrounding whitespace, it
    contains only punctuation or other special characters (no letter or
    digit). Such separator-like lines were found, by manual inspection, to be
    repeated in some of the raw text.

    Rules applied to ``text`` split on ``"\n"``:

    * lines that are empty after stripping are always removed;
    * an almost-empty line is removed when it equals (after stripping) the
      previously kept line;
    * every other line is kept unchanged, including its own leading and
      trailing whitespace.

    Args:
        text: Raw question or answer text.

    Returns:
        The kept lines joined with ``"\n"``.
    """
    kept_lines = []
    previous_kept = None
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue
        # Only separator-like lines are de-duplicated; a repeated line with
        # real content (e.g. two identical lines of code) must survive.
        if stripped == previous_kept and _is_almost_empty(stripped):
            continue
        kept_lines.append(line)
        previous_kept = stripped
    return "\n".join(kept_lines)


def _select_answer(answers):
    """Pick the answer text to keep for one question, or None to skip it.

    Each entry of ``answers`` is indexed as ``entry[0]`` (answer text) and
    ``entry[1]`` (score).

    * no answers: the question is skipped;
    * exactly one answer: it is kept regardless of its score;
    * several answers: the first one whose score is not 0 is kept; if every
      score is 0 the question is skipped.
    """
    # Explicit len() rather than truthiness: the container coming out of
    # to_pandas() may be an array type whose truth value is ambiguous
    # (assumption -- the container type is not visible here).
    if len(answers) == 0:
        return None
    if len(answers) == 1:
        return answers[0][0]
    for answer in answers:
        if answer[1] != 0:
            return answer[0]
    return None


def get_clean_df():
    """Build the cleaned question/answer table and write it to disk.

    Loads the ``train`` split of ``marianna13/mattermodeling-stackexchange``,
    keeps at most one answer per question (see ``_select_answer``), strips and
    cleans both texts with ``remove_repeated_almost_empty_lines``, and writes
    the result to ``data_clean.csv`` with columns ``q`` and ``a``.

    Returns:
        The ``pandas.DataFrame`` that was written to ``data_clean.csv``.
    """
    # Imported here so the text-cleaning helper above stays importable
    # without the `datasets` package installed.
    from datasets import load_dataset

    dataset = load_dataset("marianna13/mattermodeling-stackexchange")
    df = dataset["train"].to_pandas()

    questions_w_answer = []
    for question_text, answers in zip(df["question_text"], df["answers"]):
        answer_text = _select_answer(answers)
        if answer_text is None:
            continue
        questions_w_answer.append(
            [
                remove_repeated_almost_empty_lines(question_text.strip()),
                remove_repeated_almost_empty_lines(answer_text.strip()),
            ]
        )

    df_qa = pd.DataFrame(questions_w_answer, columns=["q", "a"])
    df_qa.to_csv("data_clean.csv", index=False)
    return df_qa


if __name__ == "__main__":
    get_clean_df()