From 1960644fb74421684d378e6f760e8aae96877078 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 19:57:31 +0100 Subject: [PATCH 01/15] mod_test for axiom --- src/snps/io/reader.py | 133 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index a8037dc..a6c3e57 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -133,6 +133,65 @@ "MT": "MT", } +CHROMOSOME_AXIOM = { + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + "10": "10", + "11": "11", + "12": "12", + "13": "13", + "14": "14", + "15": "15", + "16": "16", + "17": "17", + "18": "18", + "19": "19", + "20": "20", + "21": "21", + "22": "22", + "23": "X", + "24": "Y", + "25": "X", + "26": "MT", + 1: "1", + 2: "2", + 3: "3", + 4: "4", + 5: "5", + 6: "6", + 7: "7", + 8: "8", + 9: "9", + 10: "10", + 11: "11", + 12: "12", + 13: "13", + 14: "14", + 15: "15", + 16: "16", + 17: "17", + 18: "18", + 19: "19", + 20: "20", + 21: "21", + 22: "22", + 23: "X", + 24: "X", + 25: "Y", + 26: "MT", + "X": "X", + "Y": "Y", + "MT": "MT", +} + + def get_empty_snps_dataframe(): """Get empty dataframe normalized for usage with ``snps``. @@ -263,6 +322,11 @@ def read(self): elif re.match("^#*[ \t]*rsid[, \t]*chr", first_line): d = self.read_generic(file, compression) print('SNPs library reader: Generic 3') + + elif re.match("axiom", comments.lower()) | ("Functional Genomic Analysis" in first_line): + d = self.read_Axiom(file, compression) + print('SNPs library reader: Axiom') + elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line): d = self.read_generic(file, compression, skip=0) print('SNPs library reader: Generic 4') @@ -1329,6 +1393,75 @@ def parse(sep): return self.read_helper("generic", parser) + def read_Axiom(self, file, compression, skip=1): + """Read and parse generic CSV or TSV file. + + Notes + ----- + Assumes columns are 'rsid', 'chrom' / 'chromosome', 'pos' / 'position', and 'genotype'; + values are comma separated; unreported genotypes are indicated by '--'; and one header row + precedes data. For example: + + rsid,chromosome,position,genotype + rs1,1,1,AA + rs2,1,2,CC + rs3,1,3,-- + + Parameters + ---------- + file : str + path to file + + Returns + ------- + dict + result of `read_helper` + """ + + def parser(): + def parse(sep): + df = pd.read_csv( + file, + sep=sep, + skiprows=skip, + na_values=NA_VALUES, + names=["rsid", "chrom", "pos", "genotype"], + usecols=[0, 1, 2, 3], + index_col=0, + dtype=NORMALIZED_DTYPES, + compression=compression, + ) + df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + return df + try: + df = parse(",") + except ValueError: + try: + if isinstance(file, io.BufferedIOBase): + file.seek(0) + + df = parse("\t") + except ValueError: + if isinstance(file, io.BufferedIOBase): + file.seek(0) + + df = pd.read_csv( + file, + sep=None, + na_values=NA_VALUES, + skiprows=skip, + engine="python", + names=["rsid", "chrom", "pos", "genotype"], + usecols=[0, 1, 2, 3], + index_col=0, + dtype=NORMALIZED_DTYPES, + compression=compression, + ) + df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + return (df,) + + return self.read_helper("generic", parser) + def read_vcf(self, file, compression, provider, rsids=()): """Read and parse VCF file. From e16b3149a003d7b21d2074edf39323c4480e6898 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:01:21 +0100 Subject: [PATCH 02/15] mod_test for axiom --- src/snps/io/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index a6c3e57..ea3d889 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -323,7 +323,7 @@ def read(self): d = self.read_generic(file, compression) print('SNPs library reader: Generic 3') - elif re.match("axiom", comments.lower()) | ("Functional Genomic Analysis" in first_line): + elif re.match("axiom", comments.lower()) | re.match("Functional Genomic Analysis" in first_line): d = self.read_Axiom(file, compression) print('SNPs library reader: Axiom') From 70bd0a4e60fe5896cb9ebc6ae7832c6c1e6f60fe Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:03:18 +0100 Subject: [PATCH 03/15] mod_test for axiom --- src/snps/io/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index ea3d889..3b1d03a 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -323,7 +323,7 @@ def read(self): d = self.read_generic(file, compression) print('SNPs library reader: Generic 3') - elif re.match("axiom", comments.lower()) | re.match("Functional Genomic Analysis" in first_line): + elif ("axiom", comments.lower()) | ("Functional Genomic Analysis" in first_line): d = self.read_Axiom(file, compression) print('SNPs library reader: Axiom') From fd1ecb770642bf65490ef909d47755040a24933c Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:04:21 +0100 Subject: [PATCH 04/15] mod_test for axiom --- src/snps/io/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 3b1d03a..d5c8229 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -323,7 +323,7 @@ def read(self): d = self.read_generic(file, compression) print('SNPs library reader: Generic 3') - elif ("axiom", comments.lower()) | ("Functional Genomic Analysis" in first_line): + elif ("axiom", comments.lower()) or ("Functional Genomic Analysis" in first_line): d = self.read_Axiom(file, compression) print('SNPs library reader: Axiom') From ab337dd9261ea0335762d4928f7478e481193d9b Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:11:54 +0100 Subject: [PATCH 05/15] mod_test for axiom --- src/snps/io/reader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index d5c8229..d0d4ef7 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1423,8 +1423,9 @@ def parse(sep): df = pd.read_csv( file, sep=sep, - skiprows=skip, + comment='#', na_values=NA_VALUES, + header=None, names=["rsid", "chrom", "pos", "genotype"], usecols=[0, 1, 2, 3], index_col=0, @@ -1434,7 +1435,7 @@ def parse(sep): df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) return df try: - df = parse(",") + df = parse("\t") except ValueError: try: if isinstance(file, io.BufferedIOBase): From f759bdedd5e72c440a9641a229b081f826b77046 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:19:13 +0100 Subject: [PATCH 06/15] mod_test for axiom --- src/snps/io/reader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index d0d4ef7..80e76c9 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -157,8 +157,8 @@ "21": "21", "22": "22", "23": "X", - "24": "Y", - "25": "X", + "24": "X", + "25": "Y", "26": "MT", 1: "1", 2: "2", @@ -1433,6 +1433,7 @@ def parse(sep): compression=compression, ) df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + df = df[~df.chrom.isna()] return df try: df = parse("\t") From cdbe38e45fe77da6fdf0f4e6f95639b5d418e194 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:41:41 +0100 Subject: [PATCH 07/15] mod_test for axiom --- src/snps/io/reader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 80e76c9..9490946 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1388,7 +1388,11 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - df["chrom"] = df["chrom"].map(CHROMOSOME) + if df[df.chrom.isin(['24','25','26','27','28','29','30'])].chrom.nunique() == 7: + df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + df = df[~df.chrom.isna()] + else: + df["chrom"] = df["chrom"].map(CHROMOSOME) return (df,) return self.read_helper("generic", parser) From e8672c96821b1a343454bea136edfb8cb30e852f Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:48:05 +0100 Subject: [PATCH 08/15] mod_test for axiom --- src/snps/io/reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 9490946..5010122 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1388,7 +1388,8 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - if df[df.chrom.isin(['24','25','26','27','28','29','30'])].chrom.nunique() == 7: + resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) + if resu_axi: df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) df = df[~df.chrom.isna()] else: From a6077830780e15ba64138f9f1358c47aa1d99a8f Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 20:52:19 +0100 Subject: [PATCH 09/15] mod_test for axiom --- src/snps/io/reader.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 5010122..cc3814d 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1362,7 +1362,12 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - df["chrom"] = df["chrom"].map(CHROMOSOME) + resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) + if resu_axi: + df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + df = df[~df.chrom.isna()] + else: + df["chrom"] = df["chrom"].map(CHROMOSOME) return df try: df = parse(",") @@ -1388,12 +1393,7 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) - if resu_axi: - df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) - df = df[~df.chrom.isna()] - else: - df["chrom"] = df["chrom"].map(CHROMOSOME) + df["chrom"] = df["chrom"].map(CHROMOSOME) return (df,) return self.read_helper("generic", parser) From e73193817c00b7c44fa4e1e1f7d6d4dd7ef5da12 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 21:00:54 +0100 Subject: [PATCH 10/15] mod_test for axiom --- src/snps/io/reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index cc3814d..b0f3fb2 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1363,6 +1363,7 @@ def parse(sep): compression=compression, ) resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) + print("Sto ca") if resu_axi: df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) df = df[~df.chrom.isna()] From fbcfee5f8a7229ee4d65bfcd04fadaeeb3298035 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 22:16:50 +0100 Subject: [PATCH 11/15] mod_test for axiom --- src/snps/io/reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index b0f3fb2..e4b5161 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1364,6 +1364,7 @@ def parse(sep): ) resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) print("Sto ca") + print(resu_axi) if resu_axi: df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) df = df[~df.chrom.isna()] From 6981915283cbc8aaa75419ad0a5d20e3be313caf Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 22:17:41 +0100 Subject: [PATCH 12/15] mod_test for axiom --- src/snps/io/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index e4b5161..52bfe41 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1364,7 +1364,7 @@ def parse(sep): ) resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) print("Sto ca") - print(resu_axi) + print(df.chrom.tolist()) if resu_axi: df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) df = df[~df.chrom.isna()] From 7cde43f213d23e38006f40bee80dfe423793a121 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 22:18:54 +0100 Subject: [PATCH 13/15] mod_test for axiom --- src/snps/io/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 52bfe41..cd3e29a 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1364,7 +1364,7 @@ def parse(sep): ) resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) print("Sto ca") - print(df.chrom.tolist()) + print(df.chrom.unique().tolist()) if resu_axi: df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) df = df[~df.chrom.isna()] From 552c6c197b7f1eab5a1f9e738e3572f41dcdf9a7 Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Wed, 8 Feb 2023 22:49:11 +0100 Subject: [PATCH 14/15] mod_test for axiom --- src/snps/io/reader.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index cd3e29a..4269b63 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1362,14 +1362,8 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - resu_axi = all(elem in df.chrom.tolist() for elem in ['24','25','26','27','28','29','30']) - print("Sto ca") - print(df.chrom.unique().tolist()) - if resu_axi: - df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) - df = df[~df.chrom.isna()] - else: - df["chrom"] = df["chrom"].map(CHROMOSOME) + df["chrom"] = df["chrom"].map(CHROMOSOME) + df = df[~df.chrom.isna()] return df try: df = parse(",") From ac360664f141ec7609e5a1a5004ff687aab976ec Mon Sep 17 00:00:00 2001 From: Adriano De Marino Date: Mon, 3 Apr 2023 20:09:57 +0200 Subject: [PATCH 15/15] changing name function to FGA --- src/snps/io/reader.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 4269b63..3d97890 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -133,7 +133,7 @@ "MT": "MT", } -CHROMOSOME_AXIOM = { +CHROMOSOME_FGA = { "1": "1", "2": "2", "3": "3", @@ -323,9 +323,9 @@ def read(self): d = self.read_generic(file, compression) print('SNPs library reader: Generic 3') - elif ("axiom", comments.lower()) or ("Functional Genomic Analysis" in first_line): - d = self.read_Axiom(file, compression) - print('SNPs library reader: Axiom') + elif ("axiom", comments.lower()) and ("Functional Genomic Analysis" in first_line): + d = self.read_FGA(file, compression) + print('SNPs library reader: Functional Genomic Analysis') elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line): d = self.read_generic(file, compression, skip=0) @@ -1394,7 +1394,7 @@ def parse(sep): return self.read_helper("generic", parser) - def read_Axiom(self, file, compression, skip=1): + def read_FGA(self, file, compression, skip=1): """Read and parse generic CSV or TSV file. Notes @@ -1433,7 +1433,7 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + df["chrom"] = df["chrom"].map(CHROMOSOME_FGA) df = df[~df.chrom.isna()] return df try: @@ -1460,7 +1460,7 @@ def parse(sep): dtype=NORMALIZED_DTYPES, compression=compression, ) - df["chrom"] = df["chrom"].map(CHROMOSOME_AXIOM) + df["chrom"] = df["chrom"].map(CHROMOSOME_FGA) return (df,) return self.read_helper("generic", parser)