s-nlp · MihailSalnikov · Jul 23, 2023
diff --git a/experiments/subgraphs_datasets_prepare_input_data/mintaka.ipynb b/experiments/subgraphs_datasets_prepare_input_data/mintaka.ipynb
@@ -4,28 +4,7 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n",
-      "2023-04-06 14:53:59.996607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA\n",
-      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-      "2023-04-06 14:54:00.132818: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-      "2023-04-06 14:54:00.165111: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-      "2023-04-06 14:54:00.840228: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n",
-      "2023-04-06 14:54:00.840311: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n",
-      "2023-04-06 14:54:00.840318: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
-      "2023-04-06 14:54:02.091424: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
-      "2023-04-06 14:54:02.091459: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 4d7d28cf31fc\n",
-      "2023-04-06 14:54:02.091464: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 4d7d28cf31fc\n",
-      "2023-04-06 14:54:02.091542: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program\n",
-      "2023-04-06 14:54:02.091560: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.108.3\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import pandas as pd\n",
     "from pywikidata import Entity\n",
@@ -101,11 +80,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def prepare_data(data_df, results_df, wd_search_results_top_k: int = 1, mgenre=None, ner=None, entities_selection=None):\n",
+    "def prepare_data(data_df, results_df, wd_search_results_top_k: int = 1, mgenre=None, ner=None, entities_selection=None, test_mode=False):\n",
     "    answers_cols = [c for c in results_df.columns if 'answer_' in c]\n",
     "\n",
     "    results_df['answers_ids'] = results_df[answers_cols].progress_apply(\n",
@@ -119,36 +98,37 @@
     "    df = results_df.merge(data_df, on='question')\n",
     "\n",
     "    for _, row in tqdm(df.iterrows(), total=df.index.size):\n",
-    "        if len(row['answerEntity']) == 0:\n",
-    "            continue\n",
-    "\n",
     "        golden_true_entity = [Entity(e['name']) for e in row['answerEntity']]\n",
     "\n",
-    "        if mgenre is None or ner is None or entities_selection is None:\n",
-    "            question_entity = [\n",
-    "                Entity(e['name'])\n",
-    "                for e in row['questionEntity']\n",
-    "                if e['entityType'] == 'entity' and e['name'] not in [None, 'None', ''] and e['name'][0] == 'Q'\n",
-    "            ]\n",
-    "            additional_candidates = []\n",
-    "            for qe in question_entity:\n",
-    "                for _,e in qe.forward_one_hop_neighbours:\n",
-    "                    if e not in golden_true_entity:\n",
-    "                        additional_candidates.append(e.idx)\n",
-    "            random.shuffle(additional_candidates)\n",
-    "            additional_candidates = additional_candidates[:5]\n",
+    "        question_entity = [\n",
+    "            Entity(e['name'])\n",
+    "            for e in row['questionEntity']\n",
+    "            if e['entityType'] == 'entity' and e['name'] not in [None, 'None', ''] and e['name'][0] == 'Q'\n",
+    "        ]\n",
+    "        \n",
+    "        if test_mode:\n",
+    "            candidates_ids = row['answers_ids']\n",
     "        else:\n",
-    "            question_with_ner, entities_list = ner.entity_labeling(row['question'], True)\n",
-    "            mgenre_results = mgenre(question_with_ner)\n",
-    "            selected_entities = entities_selection(entities_list, mgenre_results)\n",
+    "            if mgenre is None or ner is None or entities_selection is None:\n",
+    "                additional_candidates = []\n",
+    "                for qe in question_entity:\n",
+    "                    for _,e in qe.forward_one_hop_neighbours:\n",
+    "                        if e not in golden_true_entity:\n",
+    "                            additional_candidates.append(e.idx)\n",
+    "                random.shuffle(additional_candidates)\n",
+    "                additional_candidates = additional_candidates[:5]\n",
+    "            else:\n",
+    "                question_with_ner, entities_list = ner.entity_labeling(row['question'], True)\n",
+    "                mgenre_results = mgenre(question_with_ner)\n",
+    "                selected_entities = entities_selection(entities_list, mgenre_results)\n",
     "\n",
-    "            questionEntity = list(itertools.chain(*[\n",
-    "                get_wd_search_results(l, 1, language='en')[:1]\n",
-    "                for l in selected_entities\n",
-    "            ]))\n",
-    "            additional_candidates = []\n",
+    "                question_entity = list(itertools.chain(*[\n",
+    "                    get_wd_search_results(l, 1, language='en')[:1]\n",
+    "                    for l in selected_entities\n",
+    "                ]))\n",
+    "                additional_candidates = []\n",
     "\n",
-    "        candidates_ids = set(additional_candidates + row['answers_ids'] + [e.idx for e in golden_true_entity])\n",
+    "            candidates_ids = set(additional_candidates + row['answers_ids'] + [e.idx for e in golden_true_entity])\n",
     "        \n",
     "        for candidate_id in candidates_ids:\n",
     "            candidate_entity = Entity(candidate_id)\n",
@@ -171,10 +151,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Reusing dataset mintaka (/root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)\n",
-      "100%|██████████| 3/3 [00:00<00:00, 267.76it/s]\n"
+      "Found cached dataset mintaka (/Users/ms/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "00fcc9682c374d8eb34158b9004bb915",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/plain": [
@@ -250,26 +243,56 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 4000/4000 [00:06<00:00, 591.16it/s]\n",
-      "100%|██████████| 4000/4000 [00:03<00:00, 1331.76it/s]\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5d336c8e3d55425ab5b2262e97502399",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/4000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bfd6fe089f814f7a91527c4fb293c38d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/4000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
     "results_test_df: pd.DataFrame = pd.read_csv('./mintaka_results_test.csv') # test\n",
     "\n",
     "with open('to_subgraphs/mintaka_test.jsonl', 'w') as f:\n",
-    "    for data_line in prepare_data(ds['test'].to_pandas(), results_test_df):\n",
+    "    for data_line in prepare_data(data_df=ds['test'].to_pandas(), results_df=results_test_df, test_mode=True):  # test_mode=True: candidate ids are taken from answers_ids only\n",
     "        f.write(ujson.dumps(data_line)+'\\n')"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_json('./to_subgraphs/mintaka_test.jsonl', lines=True)  # re-load the JSON-lines file, one record per line\n",
+    "pd.unique(df['id']).shape  # 1-tuple: how many distinct values the 'id' column holds"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -294,7 +317,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.8"
   },
   "orig_nbformat": 4
  },