add stackexchange data

OpenBioML · Oct 13, 2023 · 35d9cd0 · 35d9cd0
1 parent 18710d5
commit 35d9cd0
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 0 deletions.
diff --git a/data/tabular/mattermodeling_stackexchange/meta.yaml b/data/tabular/mattermodeling_stackexchange/meta.yaml
@@ -0,0 +1,24 @@
+---
+name: mattermodeling_stackexchange
+description: |-
+    Questions and answers mined from mattermodeling.stackexchange.com.
+targets:
+    - id: a
+      description: answer to the question
+      type: string
+identifiers:
+    - id: q
+      type: string
+      description: question asked on mattermodeling.stackexchange.com
+license: CC-BY-SA-4.0
+links:
+    - url: https://mattermodeling.stackexchange.com
+      description: original data source
+    - url: https://stackoverflow.com/help/licensing
+      description: information about the license
+num_points: 2571
+templates:
+    - |-
+      {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!}
+      {#User: |Question: |Inquiry: |\n!}{#q}
+      {#Assistant: |Answer: !}{#a}
diff --git a/data/tabular/mattermodeling_stackexchange/transform.py b/data/tabular/mattermodeling_stackexchange/transform.py
@@ -0,0 +1,64 @@
+from datasets import load_dataset
+import pandas as pd
+
+
+def remove_repeated_almost_empty_lines(text):
+    """Drop blank lines and consecutive repeats of "almost empty" lines.
+
+    A line is almost empty when it has no letter or digit, i.e. only
+    punctuation, special characters or spaces. Lines with real content are
+    kept even when repeated (e.g. identical rows in code or data snippets).
+    """
+    new_lines = []
+    previous_line = ""
+    for line in text.split("\n"):
+        stripped = line.strip()
+        almost_empty = not any(char.isalnum() for char in stripped)
+        if not stripped or (almost_empty and stripped == previous_line):
+            continue
+        new_lines.append(line)
+        previous_line = stripped
+    return "\n".join(new_lines)
+
+
+def get_clean_df():
+    """Write ``data_clean.csv`` with one cleaned question/answer pair per row.
+
+    Per question of the "train" split: with no answer the question is dropped;
+    with exactly one answer that answer is kept whatever its score; with
+    several answers only the first one whose score is not 0 is kept, and the
+    question is dropped when every score is 0. Each answer is indexed as
+    ``[text, score]``. Question and answer are stripped and passed through
+    ``remove_repeated_almost_empty_lines``; output columns are "q" and "a".
+    """
+    raw = load_dataset("marianna13/mattermodeling-stackexchange")
+    frame = raw["train"].to_pandas()
+
+    def tidy(text):
+        # surrounding whitespace goes first, then the line-level clean-up
+        return remove_repeated_almost_empty_lines(text.strip())
+
+    pairs = []
+    for _, record in frame.iterrows():
+        answers = record["answers"]
+        if len(answers) == 0:
+            continue
+        # a lone answer is kept regardless of its score
+        if len(answers) == 1:
+            picked = answers[0]
+        else:
+            # NOTE(review): `!= 0` is always true if scores are stored as
+            # strings -- confirm the score dtype in the upstream dataset.
+            picked = next((ans for ans in answers if ans[1] != 0), None)
+            if picked is None:
+                continue
+        question = tidy(record["question_text"])
+        pairs.append([question, tidy(picked[0])])
+
+    # two string columns only: the question ("q") and its answer ("a")
+    df_qa = pd.DataFrame(pairs, columns=["q", "a"])
+    df_qa.to_csv("data_clean.csv", index=False)
+
+
+if __name__ == "__main__":  # only export when run as a script, not on import
+    get_clean_df()