Skip to content

Commit

Permalink
Add clickhouse provisioning code
Browse files Browse the repository at this point in the history
Moved from the pvannierop/cbioportal-clickhouse-pilot repository.
  • Loading branch information
pvannierop committed Oct 25, 2023
1 parent a317afe commit 25f780a
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 0 deletions.
36 changes: 36 additions & 0 deletions dev/clickhouse/DEV_SETUP.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Setup for development on Clickhouse integration

## Pre-requisites

1. cBioPortal database:

- provisioned with one or more study in the _cbioportal_ database.
- user: _cbio_ password: _P@ssword1_ with all permissions on the _cbioportal_ database.
- available on port 3306 on the host system.

3. System with docker and docker compose installed.

## Setup

All commands start from the root repository location.

1. Start cBioPortal database.
2. Create MySQL views in the cBioPortal database by running the commands
in [cbio_database_views.sql](mysql_provisioning/cbio_database_views.sql).
3. Start Clickhouse (provisioned automatically).

```
cd ./dev/clickhouse
docker compose up -d
```

This will start a Clickhouse instance that is available on the host system:
- port: _8123_
- database: _cbioportal_
- username: _cbio_
- password: _P@ssword1_

## TODO

- Extend [cbio_database_views.sql](mysql_provisioning/cbio_database_views.sql)
and [c_import_mysql_data.sql](clickhouse_provisioning/c_import_mysql_data.sql) to cover all clickhouse tables.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
insert into cbioportal.sample_in_genetic_profile
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_sample_in_genetic_profile',
'cbio',
'P@ssword1'
);
177 changes: 177 additions & 0 deletions dev/clickhouse/mysql_provisioning/cbio_database_views.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# sample
DROP VIEW IF EXISTS view_sample;
CREATE VIEW view_sample AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
sample.STABLE_ID as sample_stable_id,
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
p.STABLE_ID as patient_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM sample
INNER JOIN patient p on sample.PATIENT_ID = p.INTERNAL_ID
INNER JOIN cancer_study cs on p.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;

# sample list
DROP VIEW IF EXISTS view_sample_list;
CREATE VIEW view_sample_list AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
sl.STABLE_ID as sample_list_stable_id,
sl.NAME as name,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM sample_list as sl
INNER JOIN sample_list_list as sll ON sll.LIST_ID = sl.LIST_ID
INNER JOIN sample as s ON s.INTERNAL_ID = sll.SAMPLE_ID
INNER JOIN cancer_study cs on sl.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;

# genomic_event
DROP TABLE IF EXISTS view_genomic_event;
-- This view takes a long time to materialize. I store the data in a table to prevent repeated recalculations.
# CREATE TABLE view_genomic_event AS
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
# me.PROTEIN_CHANGE as variant,
# gp.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# g.STABLE_ID as genetic_profile_stable_id
# FROM mutation
# LEFT JOIN mutation_event as me ON mutation.MUTATION_EVENT_ID = me.MUTATION_EVENT_ID
# LEFT JOIN sample_profile sp on mutation.SAMPLE_ID = sp.SAMPLE_ID and mutation.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN gene_panel gp on sp.PANEL_ID = gp.INTERNAL_ID
# LEFT JOIN genetic_profile g on sp.GENETIC_PROFILE_ID = g.GENETIC_PROFILE_ID
# LEFT JOIN cancer_study cs on g.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample on mutation.SAMPLE_ID = sample.INTERNAL_ID
# LEFT JOIN gene ON mutation.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID;
# INSERT INTO view_genomic_event
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
# convert(ce.ALTERATION, char) as variant,
# gene_panel.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# gp.STABLE_ID as genetic_profile_stable_id
# FROM sample_cna_event
# LEFT JOIN cna_event ce on sample_cna_event.CNA_EVENT_ID = ce.CNA_EVENT_ID
# LEFT JOIN gene on ce.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID
# LEFT JOIN genetic_profile gp on sample_cna_event.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample on sample_cna_event.SAMPLE_ID = sample.INTERNAL_ID
# LEFT JOIN gene_panel ON sp.PANEL_ID = gene_panel.INTERNAL_ID;
# INSERT INTO view_genomic_event
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
# hugo_gene_symbol,
# Event_Info as variant,
# g.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# gp.STABLE_ID as genetic_profile_stable_id
# FROM structural_variant as sv
# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
# INSERT INTO view_genomic_event
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
# hugo_gene_symbol,
# Event_Info as variant,
# g.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# gp.STABLE_ID as genetic_profile_stable_id
# FROM structural_variant as sv
# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;

-- structural variant
DROP VIEW IF EXISTS view_structural_variant;
CREATE VIEW view_structural_variant AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
gene1.HUGO_GENE_SYMBOL as hugo_symbol_gene1,
gene2.HUGO_GENE_SYMBOL as hugo_symbol_gene2,
g.STABLE_ID as gene_panel_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
gp.STABLE_ID as genetic_profile_stable_id
FROM structural_variant as sv
LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;

-- sample_clinical_attribute_numeric
DROP VIEW IF EXISTS view_sample_clinical_attribute_numeric;
CREATE VIEW view_sample_clinical_attribute_numeric AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID
INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID
WHERE ATTR_VALUE REGEXP '^[0-9.]+$';

-- sample_clinical_attribute_categorical
DROP VIEW IF EXISTS view_sample_clinical_attribute_categorical;
CREATE VIEW view_sample_clinical_attribute_categorical AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID
INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID
WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';

-- patient_clinical_attribute_numeric
DROP VIEW IF EXISTS view_patient_clinical_attribute_numeric;
CREATE VIEW view_patient_clinical_attribute_numeric AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID
WHERE ATTR_VALUE REGEXP '^[0-9.]+$';

-- patient_clinical_attribute_categorical
DROP VIEW IF EXISTS view_patient_clinical_attribute_categorical;
CREATE VIEW view_patient_clinical_attribute_categorical AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID
WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';

-- sample_in_genomic_profile
DROP VIEW IF EXISTS view_sample_in_genetic_profile;
CREATE VIEW view_sample_in_genetic_profile AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
replace(gp.STABLE_ID, concat(cs.CANCER_STUDY_IDENTIFIER, '_'), '') as genetic_profile_stable_id_short
FROM sample
INNER JOIN sample_profile sp on sample.INTERNAL_ID = sp.SAMPLE_ID
INNER JOIN genetic_profile gp on sp.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID
INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;

0 comments on commit 25f780a

Please sign in to comment.