diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6794eec
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,40 @@
+FROM python:3.10-slim
+
+LABEL author="Terézia Slanináková"
+LABEL email="slaninakova@mail.muni.cz"
+LABEL website="https://disa.fi.muni.cz/complex-data-analysis/"
+
+# Set the version of the image to use, default: cpu
+ARG version=cpu
+
+# Install linux packages; skip recommended extras and remove the apt lists in
+# the same layer so they do not end up in the image
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends vim \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install required python packages
+COPY requirements.txt requirements-${version}.txt /tmp/
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r /tmp/requirements-${version}.txt
+
+# Create the unprivileged user the container runs as
+RUN addgroup --gid 1000 user \
+    && adduser --gid 1000 --uid 1000 --disabled-password --gecos user user \
+    && chmod 755 /home/user
+
+# Copy the files from the host to the container and install the local package.
+# --chown makes the copied tree writable by "user"; a chown issued before the
+# COPY would leave everything copied afterwards owned by root.
+COPY --chown=user:user . /home/user
+RUN pip install --no-cache-dir -e /home/user
+
+# Port used by JupyterLab (published with `docker run -p 8888:8888`)
+EXPOSE 8888
+
+USER user
+WORKDIR /home/user
+
+# Start an interactive shell by default. Exec form must be a JSON array with
+# double quotes; single quotes make Docker treat the line as a shell command.
+CMD ["/bin/bash"]
diff --git a/README.md b/README.md index 22eb69f..8202bff 100644 --- a/README.md +++ b/README.md @@ -2,40 +2,49 @@ Learned Metric Index (LMI) is an index for approximate nearest neighbor search on complex data using machine learning and probability-based navigation. - # Getting started See examples of how to index and search in a dataset in: [01_Introduction.ipynb](01_Introduction.ipynb) notebook. 
## Installation -See also ``.github/workflows/ci.yml`` - -### Using conda +### Using virtualenv ```bash -conda create -n env python=3.8 -conda activate env -conda install matplotlib pandas scikit-learn jupyterlab -pip install h5py flake8 setuptools tqdm faiss-cpu -pip install torch --index-url https://download.pytorch.org/whl/cpu +# 1) Clone the repo with submodules and enter it +git clone --recursive git@github.com:LearnedMetricIndex/LearnedMetricIndex.git && cd LearnedMetricIndex +# 2) Create and activate a new virtual environment +python -m venv lmi-env +source lmi-env/bin/activate +# 3) Install the dependencies +pip install -r requirements-cpu.txt # alternatively requirements-gpu.txt pip install --editable . ``` -## Running +### Using docker -```bash -jupyter-lab -# and open 01_Introduction.ipynb +Requirements: +- [Docker](https://docs.docker.com/get-docker/) +- At least 1.5 GB of disk space for the CPU and up to 5.5 GB for the GPU version -# or -python3 search/search.py +```bash +# 1) Clone the repo with submodules and enter it +git clone --recursive git@github.com:LearnedMetricIndex/LearnedMetricIndex.git && cd LearnedMetricIndex +# 2) Build the docker image (CPU version) +docker build -t lmi -f Dockerfile --build-arg version=cpu . +# alternatively: docker build -t lmi -f Dockerfile --build-arg version=gpu . +# 3) Run the docker image +docker run -p 8888:8888 -it lmi bash ``` -## Evaluation +## Running ```bash -python3 eval/eval.py -python3 eval/plot.py res.csv +# Run jupyterlab, copy the printed URL into the browser and open 01_Introduction.ipynb +jupyter-lab --ip 0.0.0.0 --no-browser + +# Run the search on 100k data subset, evaluate the results and plot them. 
+# Expected time to run = ~5-10 mins +python3 search/search.py && python eval/eval.py && python eval/plot.py res.csv ``` ## Performance @@ -64,7 +73,8 @@ python3 eval/plot.py res.csv - ~6h of runtime (waries depending on the hardware) # LMI in action -🌐 [**Similarity search in 214M protein structures (AlphaFold DB)**](https://alphafind.fi.muni.cz/search) + +- 🌐 [**Similarity search in 214M protein structures (AlphaFold DB)**](https://alphafind.fi.muni.cz/) # Publications @@ -86,7 +96,7 @@ python3 eval/plot.py res.csv - [**Web**](https://alphafind.fi.muni.cz/search) - [**Repository**](https://github.com/Coda-Research-Group/AlphaFind) - [**Data**](https://data.narodni-repozitar.cz/general/datasets/d35zf-1ja47) -> PROCHÁZKA, David, Terézia SLANINÁKOVÁ, Jaroslav OĽHA, Adrián ROŠINEC, Katarína GREŠOVÁ, Miriama JÁNOŠOVÁ, Jakub ČILLÍK, Jana PORUBSKÁ, Radka SVOBODOVÁ, Vlastislav DOHNAL a Matej ANTOL.: [AlphaFind: Discover structure similarity across the entire known proteome](https://www.biorxiv.org/content/10.1101/2024.02.15.580465v1). BioRxiv (pre-print version) +> Procházka, D., Slanináková, T., Oľha, J., Rošinec, A., Grešová, K., Jánošová, M., Čillík, J., Porubská, J., Svobodová, R., Dohnal, V., & Antol, M. (2024). [AlphaFind: discover structure similarity across the proteome in AlphaFold DB](https://academic.oup.com/nar/article/52/W1/W182/7673488). Nucleic Acids Research. 
## Team diff --git a/requirements-cpu.txt b/requirements-cpu.txt new file mode 100644 index 0000000..bdb681f --- /dev/null +++ b/requirements-cpu.txt @@ -0,0 +1,3 @@ +-r requirements.txt +--extra-index-url=https://download.pytorch.org/whl/cpu +torch==2.1.1 \ No newline at end of file diff --git a/requirements-gpu.txt b/requirements-gpu.txt new file mode 100644 index 0000000..6d914b2 --- /dev/null +++ b/requirements-gpu.txt @@ -0,0 +1,2 @@ +-r requirements.txt +torch==2.1.1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..da0b0c3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +numpy==1.26.2 +matplotlib==3.8.1 +pandas==2.1.3 +scikit-learn==1.3.2 +h5py==3.10.0 +flake8==6.1.0 +tqdm==4.66.1 +faiss-cpu==1.7.4 +setuptools==60.7.0 +jupyterlab==4.0.8 \ No newline at end of file diff --git a/search/li/clustering/faiss_kmeans.py b/search/li/clustering/faiss_kmeans.py index 73c3c70..6026b76 100644 --- a/search/li/clustering/faiss_kmeans.py +++ b/search/li/clustering/faiss_kmeans.py @@ -16,6 +16,7 @@ def cluster( _, d = data.shape kmeans = Kmeans(d=d, k=n_clusters, **parameters) + data = np.ascontiguousarray(data.astype(np.float32)) kmeans.train(data) labels = kmeans.index.search(data, 1)[1].T[0] # type: ignore