diff --git a/experiments/subgraphs_datasets_prepare_input_data/mintaka.ipynb b/experiments/subgraphs_datasets_prepare_input_data/mintaka.ipynb
index e83a360..6b4add2 100644
--- a/experiments/subgraphs_datasets_prepare_input_data/mintaka.ipynb
+++ b/experiments/subgraphs_datasets_prepare_input_data/mintaka.ipynb
@@ -4,28 +4,7 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n",
-      "2023-04-06 14:53:59.996607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
-      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-      "2023-04-06 14:54:00.132818: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-      "2023-04-06 14:54:00.165111: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-      "2023-04-06 14:54:00.840228: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n",
-      "2023-04-06 14:54:00.840311: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n",
-      "2023-04-06 14:54:00.840318: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
-      "2023-04-06 14:54:02.091424: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
-      "2023-04-06 14:54:02.091459: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 4d7d28cf31fc\n",
-      "2023-04-06 14:54:02.091464: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 4d7d28cf31fc\n",
-      "2023-04-06 14:54:02.091542: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program\n",
-      "2023-04-06 14:54:02.091560: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.108.3\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import pandas as pd\n",
     "from pywikidata import Entity\n",
@@ -101,11 +80,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def prepare_data(data_df, results_df, wd_search_results_top_k: int = 1, mgenre=None, ner=None, entities_selection=None):\n",
+    "def prepare_data(data_df, results_df, wd_search_results_top_k: int = 1, mgenre=None, ner=None, entities_selection=None, test_mode=False):\n",
     "    answers_cols = [c for c in results_df.columns if 'answer_' in c]\n",
     "\n",
     "    results_df['answers_ids'] = results_df[answers_cols].progress_apply(\n",
@@ -119,36 +98,37 @@
     "    df = results_df.merge(data_df, on='question')\n",
     "\n",
     "    for _, row in tqdm(df.iterrows(), total=df.index.size):\n",
-    "        if len(row['answerEntity']) == 0:\n",
-    "            continue\n",
-    "\n",
     "        golden_true_entity = [Entity(e['name']) for e in row['answerEntity']]\n",
     "\n",
-    "        if mgenre is None or ner is None or entities_selection is None:\n",
-    "            question_entity = [\n",
-    "                Entity(e['name'])\n",
-    "                for e in row['questionEntity']\n",
-    "                if e['entityType'] == 'entity' and e['name'] not in [None, 'None', ''] and e['name'][0] == 'Q'\n",
-    "            ]\n",
-    "            additional_candidates = []\n",
-    "            for qe in question_entity:\n",
-    "                for _,e in qe.forward_one_hop_neighbours:\n",
-    "                    if e not in golden_true_entity:\n",
-    "                        additional_candidates.append(e.idx)\n",
-    "            random.shuffle(additional_candidates)\n",
-    "            additional_candidates = additional_candidates[:5]\n",
+    "        question_entity = [\n",
+    "            Entity(e['name'])\n",
+    "            for e in row['questionEntity']\n",
+    "            if e['entityType'] == 'entity' and e['name'] not in [None, 'None', ''] and e['name'][0] == 'Q'\n",
+    "        ]\n",
+    "        \n",
+    "        if test_mode:\n",
+    "            candidates_ids = row['answers_ids']\n",
     "        else:\n",
-    "            question_with_ner, entities_list = ner.entity_labeling(row['question'], True)\n",
-    "            mgenre_results = mgenre(question_with_ner)\n",
-    "            selected_entities = entities_selection(entities_list, mgenre_results)\n",
+    "            if mgenre is None or ner is None or entities_selection is None:\n",
+    "                additional_candidates = []\n",
+    "                for qe in question_entity:\n",
+    "                    for _,e in qe.forward_one_hop_neighbours:\n",
+    "                        if e not in golden_true_entity:\n",
+    "                            additional_candidates.append(e.idx)\n",
+    "                random.shuffle(additional_candidates)\n",
+    "                additional_candidates = additional_candidates[:5]\n",
+    "            else:\n",
+    "                question_with_ner, entities_list = ner.entity_labeling(row['question'], True)\n",
+    "                mgenre_results = mgenre(question_with_ner)\n",
+    "                selected_entities = entities_selection(entities_list, mgenre_results)\n",
     "\n",
-    "            questionEntity = list(itertools.chain(*[\n",
-    "                get_wd_search_results(l, 1, language='en')[:1]\n",
-    "                for l in selected_entities\n",
-    "            ]))\n",
-    "            additional_candidates = []\n",
+    "                question_entity = list(itertools.chain(*[\n",
+    "                    get_wd_search_results(l, 1, language='en')[:1]\n",
+    "                    for l in selected_entities\n",
+    "                ]))\n",
+    "                additional_candidates = []\n",
     "\n",
-    "        candidates_ids = set(additional_candidates + row['answers_ids'] + [e.idx for e in golden_true_entity])\n",
+    "            candidates_ids = set(additional_candidates + row['answers_ids'] + [e.idx for e in golden_true_entity])\n",
     "    \n",
     "        for candidate_id in candidates_ids:\n",
     "            candidate_entity = Entity(candidate_id)\n",
@@ -171,10 +151,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Reusing dataset mintaka (/root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)\n",
-      "100%|██████████| 3/3 [00:00<00:00, 267.76it/s]\n"
+      "Found cached dataset mintaka (/Users/ms/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "00fcc9682c374d8eb34158b9004bb915",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00
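For reviewers, a minimal usage sketch (not part of the diff) of how the changed `prepare_data` is expected to be called. Here `data_df` and `results_df` stand in for the frames built earlier in the notebook, and the function's return behaviour is assumed unchanged by this patch; only the `test_mode` flag is new.

```python
# Sketch under the assumptions above.

# Train/dev-style preparation: candidate ids mix the seq2seq answer columns
# (row['answers_ids']), the golden answer entities, and up to 5 shuffled
# one-hop neighbours of the question entities.
prepare_data(data_df, results_df, test_mode=False)

# Test-style preparation: candidate ids come only from row['answers_ids'],
# so golden answer entities are never leaked into the candidate set.
prepare_data(data_df, results_df, test_mode=True)
```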