From c1b940d36ed6153186749482d18ea927f19152c1 Mon Sep 17 00:00:00 2001 From: pvannierop Date: Wed, 25 Oct 2023 15:02:05 +0200 Subject: [PATCH] Cover all clickhouse tables with mysql import --- .../c_import_mysql_data.sql | 107 ++++++++++ .../cbio_database_views.sql | 189 ++++++++++-------- 2 files changed, 208 insertions(+), 88 deletions(-) diff --git a/dev/clickhouse/clickhouse_provisioning/c_import_mysql_data.sql b/dev/clickhouse/clickhouse_provisioning/c_import_mysql_data.sql index f53892b4621..1dc2967f48b 100644 --- a/dev/clickhouse/clickhouse_provisioning/c_import_mysql_data.sql +++ b/dev/clickhouse/clickhouse_provisioning/c_import_mysql_data.sql @@ -6,3 +6,110 @@ select * from mysql( 'cbio', 'P@ssword1' ); + +insert into cbioportal.sample_list +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_sample_list', + 'cbio', + 'P@ssword1' + ); + +insert into cbioportal.structural_variant +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_structural_variant', + 'cbio', + 'P@ssword1' + ); + +insert into cbioportal.sample +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_sample', + 'cbio', + 'P@ssword1' + ); + + +insert into cbioportal.sample_clinical_attribute_numeric +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_sample_clinical_attribute_numeric', + 'cbio', + 'P@ssword1' + ); + + +insert into cbioportal.sample_clinical_attribute_categorical +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_sample_clinical_attribute_categorical', + 'cbio', + 'P@ssword1' + ); + + +insert into cbioportal.patient_clinical_attribute_numeric +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_patient_clinical_attribute_numeric', + 'cbio', + 'P@ssword1' + ); + + + +insert into cbioportal.patient_clinical_attribute_categorical +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_patient_clinical_attribute_categorical', + 'cbio', + 'P@ssword1' + ); + + +insert into cbioportal.genomic_event +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_genomic_event_mutation', + 'cbio', + 'P@ssword1' + ); + + +insert into cbioportal.genomic_event +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_genomic_event_cna', + 'cbio', + 'P@ssword1' + ); + +insert into cbioportal.genomic_event +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_genomic_event_sv_gene1', + 'cbio', + 'P@ssword1' + ); + + +insert into cbioportal.genomic_event +select * from mysql( + '127.0.0.1:3306', + 'cbioportal', + 'view_genomic_event_sv_gene2', + 'cbio', + 'P@ssword1' + ); \ No newline at end of file diff --git a/dev/clickhouse/mysql_provisioning/cbio_database_views.sql b/dev/clickhouse/mysql_provisioning/cbio_database_views.sql index 88fdabd618e..0556c06e86e 100644 --- a/dev/clickhouse/mysql_provisioning/cbio_database_views.sql +++ b/dev/clickhouse/mysql_provisioning/cbio_database_views.sql @@ -3,8 +3,10 @@ DROP VIEW IF EXISTS view_sample; CREATE VIEW view_sample AS SELECT concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id, + TO_BASE64(concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID)) as sample_unique_id_base64, sample.STABLE_ID as sample_stable_id, concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, + TO_BASE64(concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID)) as patient_unique_id_base64, p.STABLE_ID as patient_stable_id, cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier FROM sample @@ -25,70 +27,81 @@ FROM sample_list as sl INNER JOIN cancer_study cs on sl.CANCER_STUDY_ID = cs.CANCER_STUDY_ID; # genomic_event -DROP TABLE IF EXISTS view_genomic_event; +DROP VIEW IF EXISTS view_genomic_event; +DROP VIEW IF EXISTS view_genomic_event_mutation; -- This view takes a long time to materialize. I store the data in a table to prevent repeated recalculations. -# CREATE TABLE view_genomic_event AS -# SELECT -# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id, -# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol, -# me.PROTEIN_CHANGE as variant, -# gp.STABLE_ID as gene_panel_stable_id, -# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, -# g.STABLE_ID as genetic_profile_stable_id -# FROM mutation -# LEFT JOIN mutation_event as me ON mutation.MUTATION_EVENT_ID = me.MUTATION_EVENT_ID -# LEFT JOIN sample_profile sp on mutation.SAMPLE_ID = sp.SAMPLE_ID and mutation.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID -# LEFT JOIN gene_panel gp on sp.PANEL_ID = gp.INTERNAL_ID -# LEFT JOIN genetic_profile g on sp.GENETIC_PROFILE_ID = g.GENETIC_PROFILE_ID -# LEFT JOIN cancer_study cs on g.CANCER_STUDY_ID = cs.CANCER_STUDY_ID -# LEFT JOIN sample on mutation.SAMPLE_ID = sample.INTERNAL_ID -# LEFT JOIN gene ON mutation.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID; -# INSERT INTO view_genomic_event -# SELECT -# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id, -# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol, -# convert(ce.ALTERATION, char) as variant, -# gene_panel.STABLE_ID as gene_panel_stable_id, -# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, -# gp.STABLE_ID as genetic_profile_stable_id -# FROM sample_cna_event -# LEFT JOIN cna_event ce on sample_cna_event.CNA_EVENT_ID = ce.CNA_EVENT_ID -# LEFT JOIN gene on ce.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID -# LEFT JOIN genetic_profile gp on sample_cna_event.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID -# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID -# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID -# LEFT JOIN sample on sample_cna_event.SAMPLE_ID = sample.INTERNAL_ID -# LEFT JOIN gene_panel ON sp.PANEL_ID = gene_panel.INTERNAL_ID; -# INSERT INTO view_genomic_event -# SELECT -# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, -# hugo_gene_symbol, -# Event_Info as variant, -# g.STABLE_ID as gene_panel_stable_id, -# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, -# gp.STABLE_ID as genetic_profile_stable_id -# FROM structural_variant as sv -# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID -# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID -# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID -# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID -# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID -# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID; -# INSERT INTO view_genomic_event -# SELECT -# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, -# hugo_gene_symbol, -# Event_Info as variant, -# g.STABLE_ID as gene_panel_stable_id, -# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, -# gp.STABLE_ID as genetic_profile_stable_id -# FROM structural_variant as sv -# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID -# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID -# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID -# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID -# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID -# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID; +CREATE VIEW view_genomic_event_mutation AS +SELECT + concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id, + gene.HUGO_GENE_SYMBOL as hugo_gene_symbol, + me.PROTEIN_CHANGE as variant, + 'mutation', + gp.STABLE_ID as gene_panel_stable_id, + cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, + g.STABLE_ID as genetic_profile_stable_id +FROM mutation + LEFT JOIN mutation_event as me ON mutation.MUTATION_EVENT_ID = me.MUTATION_EVENT_ID + LEFT JOIN sample_profile sp on mutation.SAMPLE_ID = sp.SAMPLE_ID and mutation.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID + LEFT JOIN gene_panel gp on sp.PANEL_ID = gp.INTERNAL_ID + LEFT JOIN genetic_profile g on sp.GENETIC_PROFILE_ID = g.GENETIC_PROFILE_ID + LEFT JOIN cancer_study cs on g.CANCER_STUDY_ID = cs.CANCER_STUDY_ID + LEFT JOIN sample on mutation.SAMPLE_ID = sample.INTERNAL_ID + LEFT JOIN gene ON mutation.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID; + +DROP VIEW IF EXISTS view_genomic_event_cna; +CREATE VIEW view_genomic_event_cna AS +SELECT + concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id, + gene.HUGO_GENE_SYMBOL as hugo_gene_symbol, + convert(ce.ALTERATION, char) as variant, + 'cna', + gene_panel.STABLE_ID as gene_panel_stable_id, + cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, + gp.STABLE_ID as genetic_profile_stable_id +FROM sample_cna_event + INNER JOIN cna_event ce on sample_cna_event.CNA_EVENT_ID = ce.CNA_EVENT_ID + INNER JOIN gene on ce.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID + INNER JOIN genetic_profile gp on sample_cna_event.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID + INNER JOIN sample on sample_cna_event.SAMPLE_ID = sample.INTERNAL_ID + INNER JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID AND sp.SAMPLE_ID = sample.INTERNAL_ID + INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID + INNER JOIN gene_panel ON sp.PANEL_ID = gene_panel.INTERNAL_ID; + +DROP VIEW IF EXISTS view_genomic_event_sv_gene1; +CREATE VIEW view_genomic_event_sv_gene1 AS +SELECT + concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, + hugo_gene_symbol, + Event_Info as variant, + 'sv', + g.STABLE_ID as gene_panel_stable_id, + cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, + gp.STABLE_ID as genetic_profile_stable_id +FROM structural_variant as sv + INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID + INNER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID + INNER JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID + INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID + INNER JOIN sample_profile sp on sp.SAMPLE_ID = sv.SAMPLE_ID and sp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID + INNER JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID; + +DROP VIEW IF EXISTS view_genomic_event_sv_gene2; +CREATE VIEW view_genomic_event_sv_gene2 AS +SELECT + concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, + hugo_gene_symbol, + Event_Info as variant, + 'sv', + g.STABLE_ID as gene_panel_stable_id, + cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, + gp.STABLE_ID as genetic_profile_stable_id +FROM structural_variant as sv + INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID + INNER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID + INNER JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID + INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID + INNER JOIN sample_profile sp on sp.SAMPLE_ID = sv.SAMPLE_ID and sp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID + INNER JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID; -- structural variant DROP VIEW IF EXISTS view_structural_variant; @@ -101,25 +114,25 @@ SELECT cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier, gp.STABLE_ID as genetic_profile_stable_id FROM structural_variant as sv - LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID - LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID - LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID - LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID - LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID - LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID - LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID; + INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID + INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID + INNER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID + INNER JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID + INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID + INNER JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID AND sp.SAMPLE_ID = sv.SAMPLE_ID + INNER JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID; -- sample_clinical_attribute_numeric DROP VIEW IF EXISTS view_sample_clinical_attribute_numeric; CREATE VIEW view_sample_clinical_attribute_numeric AS SELECT - concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, - concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, + concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, + concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, ATTR_ID as attribute_name, ATTR_VALUE as attribute_value, - cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID + cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier +FROM cancer_study + INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID WHERE ATTR_VALUE REGEXP '^[0-9.]+$'; @@ -128,13 +141,13 @@ WHERE ATTR_VALUE REGEXP '^[0-9.]+$'; DROP VIEW IF EXISTS view_sample_clinical_attribute_categorical; CREATE VIEW view_sample_clinical_attribute_categorical AS SELECT - concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, - concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, + concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id, + concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, ATTR_ID as attribute_name, ATTR_VALUE as attribute_value, - cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID + cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier +FROM cancer_study + INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$'; @@ -143,12 +156,12 @@ WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$'; DROP VIEW IF EXISTS view_patient_clinical_attribute_numeric; CREATE VIEW view_patient_clinical_attribute_numeric AS SELECT - concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, + concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, ATTR_ID as attribute_name, ATTR_VALUE as attribute_value, - cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID + cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier +FROM cancer_study + INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID WHERE ATTR_VALUE REGEXP '^[0-9.]+$'; @@ -156,12 +169,12 @@ WHERE ATTR_VALUE REGEXP '^[0-9.]+$'; DROP VIEW IF EXISTS view_patient_clinical_attribute_categorical; CREATE VIEW view_patient_clinical_attribute_categorical AS SELECT - concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, + concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id, ATTR_ID as attribute_name, ATTR_VALUE as attribute_value, - cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID + cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier +FROM cancer_study + INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';