Skip to content

Commit

Permalink
Cover all clickhouse tables with mysql import
Browse files Browse the repository at this point in the history
  • Loading branch information
pvannierop committed Oct 25, 2023
1 parent 25f780a commit c1b940d
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 88 deletions.
107 changes: 107 additions & 0 deletions dev/clickhouse/clickhouse_provisioning/c_import_mysql_data.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,110 @@ select * from mysql(
'cbio',
'P@ssword1'
);

insert into cbioportal.sample_list
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_sample_list',
'cbio',
'P@ssword1'
);

insert into cbioportal.structural_variant
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_structural_variant',
'cbio',
'P@ssword1'
);

insert into cbioportal.sample
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_sample',
'cbio',
'P@ssword1'
);


insert into cbioportal.sample_clinical_attribute_numeric
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_sample_clinical_attribute_numeric',
'cbio',
'P@ssword1'
);


insert into cbioportal.sample_clinical_attribute_categorical
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_sample_clinical_attribute_categorical',
'cbio',
'P@ssword1'
);


insert into cbioportal.patient_clinical_attribute_numeric
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_patient_clinical_attribute_numeric',
'cbio',
'P@ssword1'
);



insert into cbioportal.patient_clinical_attribute_categorical
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_patient_clinical_attribute_categorical',
'cbio',
'P@ssword1'
);


insert into cbioportal.genomic_event
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_genomic_event_mutation',
'cbio',
'P@ssword1'
);


insert into cbioportal.genomic_event
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_genomic_event_cna',
'cbio',
'P@ssword1'
);

insert into cbioportal.genomic_event
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_genomic_event_sv_gene1',
'cbio',
'P@ssword1'
);


insert into cbioportal.genomic_event
select * from mysql(
'127.0.0.1:3306',
'cbioportal',
'view_genomic_event_sv_gene2',
'cbio',
'P@ssword1'
);
189 changes: 101 additions & 88 deletions dev/clickhouse/mysql_provisioning/cbio_database_views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ DROP VIEW IF EXISTS view_sample;
CREATE VIEW view_sample AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
TO_BASE64(concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID)) as sample_unique_id_base64,
sample.STABLE_ID as sample_stable_id,
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
TO_BASE64(concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID)) as patient_unique_id_base64,
p.STABLE_ID as patient_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM sample
Expand All @@ -25,70 +27,81 @@ FROM sample_list as sl
INNER JOIN cancer_study cs on sl.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;

# genomic_event
DROP TABLE IF EXISTS view_genomic_event;
DROP VIEW IF EXISTS view_genomic_event;
DROP VIEW IF EXISTS view_genomic_event_mutation;
-- This view takes a long time to materialize. I store the data in a table to prevent repeated recalculations.
# CREATE TABLE view_genomic_event AS
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
# me.PROTEIN_CHANGE as variant,
# gp.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# g.STABLE_ID as genetic_profile_stable_id
# FROM mutation
# LEFT JOIN mutation_event as me ON mutation.MUTATION_EVENT_ID = me.MUTATION_EVENT_ID
# LEFT JOIN sample_profile sp on mutation.SAMPLE_ID = sp.SAMPLE_ID and mutation.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN gene_panel gp on sp.PANEL_ID = gp.INTERNAL_ID
# LEFT JOIN genetic_profile g on sp.GENETIC_PROFILE_ID = g.GENETIC_PROFILE_ID
# LEFT JOIN cancer_study cs on g.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample on mutation.SAMPLE_ID = sample.INTERNAL_ID
# LEFT JOIN gene ON mutation.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID;
# INSERT INTO view_genomic_event
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
# convert(ce.ALTERATION, char) as variant,
# gene_panel.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# gp.STABLE_ID as genetic_profile_stable_id
# FROM sample_cna_event
# LEFT JOIN cna_event ce on sample_cna_event.CNA_EVENT_ID = ce.CNA_EVENT_ID
# LEFT JOIN gene on ce.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID
# LEFT JOIN genetic_profile gp on sample_cna_event.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample on sample_cna_event.SAMPLE_ID = sample.INTERNAL_ID
# LEFT JOIN gene_panel ON sp.PANEL_ID = gene_panel.INTERNAL_ID;
# INSERT INTO view_genomic_event
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
# hugo_gene_symbol,
# Event_Info as variant,
# g.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# gp.STABLE_ID as genetic_profile_stable_id
# FROM structural_variant as sv
# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
# INSERT INTO view_genomic_event
# SELECT
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
# hugo_gene_symbol,
# Event_Info as variant,
# g.STABLE_ID as gene_panel_stable_id,
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
# gp.STABLE_ID as genetic_profile_stable_id
# FROM structural_variant as sv
# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
CREATE VIEW view_genomic_event_mutation AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
me.PROTEIN_CHANGE as variant,
'mutation',
gp.STABLE_ID as gene_panel_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
g.STABLE_ID as genetic_profile_stable_id
FROM mutation
LEFT JOIN mutation_event as me ON mutation.MUTATION_EVENT_ID = me.MUTATION_EVENT_ID
LEFT JOIN sample_profile sp on mutation.SAMPLE_ID = sp.SAMPLE_ID and mutation.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
LEFT JOIN gene_panel gp on sp.PANEL_ID = gp.INTERNAL_ID
LEFT JOIN genetic_profile g on sp.GENETIC_PROFILE_ID = g.GENETIC_PROFILE_ID
LEFT JOIN cancer_study cs on g.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
LEFT JOIN sample on mutation.SAMPLE_ID = sample.INTERNAL_ID
LEFT JOIN gene ON mutation.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID;

DROP VIEW IF EXISTS view_genomic_event_cna;
CREATE VIEW view_genomic_event_cna AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
convert(ce.ALTERATION, char) as variant,
'cna',
gene_panel.STABLE_ID as gene_panel_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
gp.STABLE_ID as genetic_profile_stable_id
FROM sample_cna_event
INNER JOIN cna_event ce on sample_cna_event.CNA_EVENT_ID = ce.CNA_EVENT_ID
INNER JOIN gene on ce.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID
INNER JOIN genetic_profile gp on sample_cna_event.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID
INNER JOIN sample on sample_cna_event.SAMPLE_ID = sample.INTERNAL_ID
INNER JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID AND sp.SAMPLE_ID = sample.INTERNAL_ID
INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
INNER JOIN gene_panel ON sp.PANEL_ID = gene_panel.INTERNAL_ID;

DROP VIEW IF EXISTS view_genomic_event_sv_gene1;
CREATE VIEW view_genomic_event_sv_gene1 AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
hugo_gene_symbol,
Event_Info as variant,
'sv',
g.STABLE_ID as gene_panel_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
gp.STABLE_ID as genetic_profile_stable_id
FROM structural_variant as sv
INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
INNER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
INNER JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
INNER JOIN sample_profile sp on sp.SAMPLE_ID = sv.SAMPLE_ID and sp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
INNER JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;

DROP VIEW IF EXISTS view_genomic_event_sv_gene2;
CREATE VIEW view_genomic_event_sv_gene2 AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
hugo_gene_symbol,
Event_Info as variant,
'sv',
g.STABLE_ID as gene_panel_stable_id,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
gp.STABLE_ID as genetic_profile_stable_id
FROM structural_variant as sv
INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
INNER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
INNER JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
INNER JOIN sample_profile sp on sp.SAMPLE_ID = sv.SAMPLE_ID and sp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
INNER JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;

-- structural variant
DROP VIEW IF EXISTS view_structural_variant;
Expand All @@ -101,25 +114,25 @@ SELECT
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
gp.STABLE_ID as genetic_profile_stable_id
FROM structural_variant as sv
LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
INNER JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
INNER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
INNER JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
INNER JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID AND sp.SAMPLE_ID = sv.SAMPLE_ID
INNER JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;

-- sample_clinical_attribute_numeric
DROP VIEW IF EXISTS view_sample_clinical_attribute_numeric;
CREATE VIEW view_sample_clinical_attribute_numeric AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study
INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID
INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID
WHERE ATTR_VALUE REGEXP '^[0-9.]+$';
Expand All @@ -128,13 +141,13 @@ WHERE ATTR_VALUE REGEXP '^[0-9.]+$';
DROP VIEW IF EXISTS view_sample_clinical_attribute_categorical;
CREATE VIEW view_sample_clinical_attribute_categorical AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study
INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID
INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID
WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';
Expand All @@ -143,25 +156,25 @@ WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';
DROP VIEW IF EXISTS view_patient_clinical_attribute_numeric;
CREATE VIEW view_patient_clinical_attribute_numeric AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study
INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID
WHERE ATTR_VALUE REGEXP '^[0-9.]+$';

-- patient_clinical_attribute_categorical
DROP VIEW IF EXISTS view_patient_clinical_attribute_categorical;
CREATE VIEW view_patient_clinical_attribute_categorical AS
SELECT
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
concat(cancer_study.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
ATTR_ID as attribute_name,
ATTR_VALUE as attribute_value,
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
cancer_study.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
FROM cancer_study
INNER JOIN patient p on cancer_study.CANCER_STUDY_ID = p.CANCER_STUDY_ID
INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID
WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';

Expand Down

0 comments on commit c1b940d

Please sign in to comment.