From 9429c1c6716a603eb0617cb4372e52e36f3e90d5 Mon Sep 17 00:00:00 2001 From: Matt Prestegaard Date: Fri, 30 Jun 2017 21:23:45 -0500 Subject: [PATCH] Allow for note to be added (third argument) upon input --- .../valid/LinkageDisequilibriumAnalyzer.java | 24 +----- .../src/main/java/org/dash/valid/Sample.java | 5 ++ .../org/dash/valid/gl/GLStringUtilities.java | 76 ++++++++++++++----- .../gl/LinkageDisequilibriumGenotypeList.java | 9 +++ .../main/resources/schema/LinkageFindings.xsd | 1 + .../LinkageDisequilibriumAnalyzerTest.java | 7 +- .../test/java/org/dash/gl/GLStringTest.java | 20 ++--- .../org/dash/gl/GLStringUtilitiesTest.java | 22 +++--- .../test/resources/tabDelimitedExample.txt | 2 +- 9 files changed, 96 insertions(+), 70 deletions(-) diff --git a/ld-validation/src/main/java/org/dash/valid/LinkageDisequilibriumAnalyzer.java b/ld-validation/src/main/java/org/dash/valid/LinkageDisequilibriumAnalyzer.java index ff8694e..03e1f4f 100644 --- a/ld-validation/src/main/java/org/dash/valid/LinkageDisequilibriumAnalyzer.java +++ b/ld-validation/src/main/java/org/dash/valid/LinkageDisequilibriumAnalyzer.java @@ -24,9 +24,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import java.util.logging.Handler; import java.util.logging.LogManager; import java.util.logging.Logger; @@ -89,7 +87,7 @@ private static void analyzeGLStringFiles(String[] filenames) throws IOException } public static List analyzeGLStringFile(String name, BufferedReader reader) throws IOException { - LinkedHashMap glStrings = GLStringUtilities.readGLStringFile(name, reader); + List glStrings = GLStringUtilities.readGLStringFile(name, reader); List samplesList = detectLinkages(glStrings); @@ -100,7 +98,7 @@ public static List analyzeGLStringFile(String name, BufferedReader reade * @param filename */ public static void analyzeGLStringFile(String filename) throws IOException { - LinkedHashMap glStrings = GLStringUtilities.readGLStringFile(filename); + List glStrings = GLStringUtilities.readGLStringFile(filename); List samplesList = null; samplesList = detectLinkages(glStrings); @@ -111,7 +109,6 @@ public static void analyzeGLStringFile(String filename) throws IOException { HaplotypePairWriter.getInstance().reportDetectedLinkages(findings); CommonWellDocumentedWriter.getInstance().reportCommonWellDocumented(findings); DetectedFindingsWriter.getInstance().reportDetectedFindings(findings); - //SummaryWriter.getInstance().reportDetectedLinkages(findings); } SamplesList allSamples = new SamplesList(); @@ -125,24 +122,11 @@ public static void analyzeGLStringFile(String filename) throws IOException { * @throws IOException * @throws SecurityException */ - private static List detectLinkages(Map glStrings) { - LinkageDisequilibriumGenotypeList linkedGLString; - String glString; + private static List detectLinkages(List glStrings) { List samplesList = new ArrayList(); int idx = 1; - for (String key : glStrings.keySet()) { - glString = glStrings.get(key); - String submittedGlString = glString; - - if (!GLStringUtilities.validateGLStringFormat(glString)) { - glString = GLStringUtilities.fullyQualifyGLString(glString); - } - - MultilocusUnphasedGenotype mug = GLStringUtilities.convertToMug(glString); - linkedGLString = new LinkageDisequilibriumGenotypeList(key, mug); - - linkedGLString.setSubmittedGlString(submittedGlString); + for (LinkageDisequilibriumGenotypeList linkedGLString : glStrings) { List knownHaplotypes = GLStringUtilities.buildHaplotypes(linkedGLString); diff --git a/ld-validation/src/main/java/org/dash/valid/Sample.java b/ld-validation/src/main/java/org/dash/valid/Sample.java index d152924..8abc1bb 100644 --- a/ld-validation/src/main/java/org/dash/valid/Sample.java +++ b/ld-validation/src/main/java/org/dash/valid/Sample.java @@ -20,6 +20,11 @@ public String getId() { return getGenotypeList().getId(); } + @XmlAttribute(name="note") + public String getNote() { + return getGenotypeList().getNote(); + } + @XmlElement(name="processed-gl-string") public String getProcessedGlString() { return getGenotypeList().getGLString().equals(getGenotypeList().getSubmittedGlString()) ? null : getGenotypeList().getGLString(); diff --git a/ld-validation/src/main/java/org/dash/valid/gl/GLStringUtilities.java b/ld-validation/src/main/java/org/dash/valid/gl/GLStringUtilities.java index 3bcbd5a..39d52ae 100644 --- a/ld-validation/src/main/java/org/dash/valid/gl/GLStringUtilities.java +++ b/ld-validation/src/main/java/org/dash/valid/gl/GLStringUtilities.java @@ -32,7 +32,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedHashMap; import java.util.List; import java.util.Set; import java.util.StringTokenizer; @@ -416,11 +415,11 @@ public static String fillLocus(Locus locus, String segment) { return segment; } - public static LinkedHashMap readGLStringFile(String name, BufferedReader reader) { - LinkedHashMap glStrings = null; + public static List readGLStringFile(String name, BufferedReader reader) { + List linkedGLStrings = null; try { - glStrings = parseGLStringFile(name, reader); + linkedGLStrings = parseGLStringFile(name, reader); } catch (IOException e) { LOGGER.severe("Problem reading GL String file: " + name); e.printStackTrace(); @@ -430,12 +429,12 @@ public static LinkedHashMap readGLStringFile(String name, Buffer e.printStackTrace(); } - return glStrings; + return linkedGLStrings; } - public static LinkedHashMap readGLStringFile(String filename) { + public static List readGLStringFile(String filename) { BufferedReader reader = null; - LinkedHashMap glStrings = null; + List linkedGLStrings = null; try { InputStream stream = GLStringUtilities.class.getClassLoader() @@ -446,7 +445,7 @@ public static LinkedHashMap readGLStringFile(String filename) { reader = new BufferedReader(new InputStreamReader(stream)); - glStrings = parseGLStringFile(filename, reader); + linkedGLStrings = parseGLStringFile(filename, reader); } catch (FileNotFoundException e) { LOGGER.severe("Couldn't find GL String file: " + filename); @@ -465,14 +464,15 @@ public static LinkedHashMap readGLStringFile(String filename) { e.printStackTrace(); } } - - return glStrings; + + return linkedGLStrings; } - private static LinkedHashMap parseGLStringFile(String filename, + private static List parseGLStringFile(String filename, BufferedReader reader) throws IOException, ParserConfigurationException, SAXException { - LinkedHashMap glStrings = new LinkedHashMap(); + List linkedGLStrings = new ArrayList(); + if (filename.endsWith(GLStringConstants.XML) || filename.endsWith(GLStringConstants.HML)) { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); @@ -492,29 +492,63 @@ private static LinkedHashMap parseGLStringFile(String filename, if (j > 0) glString.append(GLStringConstants.GENE_DELIMITER); glString.append(((Element) alleleAssignment.getElementsByTagName(GLStringConstants.GL_STRING_ELEMENT).item(0)).getTextContent().trim()); } - glStrings.put(sampleId, glString.toString()); + + linkedGLStrings.add(inflateGenotypeList(sampleId, glString.toString(), null)); } } else { String line; String[] parts = null; int lineNumber = 0; + String glString; + String id; + String note = null; + while ((line = reader.readLine()) != null) { + lineNumber++; + parts = line.split(FILE_DELIMITER_REGEX); + if (parts.length == 1) { - glStrings.put(filename + "-" + lineNumber, parts[0]); - } else if (parts.length == 2) { - glStrings.put(parts[0], parts[1]); - } else { + id = filename + "-" + (lineNumber - 1); + glString = parts[0]; + } else if (parts.length >= 2) { + id = parts[0]; + glString = parts[1]; + + if (parts.length == 3) note = parts[2]; + } + else { LOGGER.warning("Unexpected line format at line " - + lineNumber + ": " + filename); + + (lineNumber - 1) + ": " + filename); + + continue; } - - lineNumber++; + + linkedGLStrings.add(inflateGenotypeList(id, glString, note)); + } } + + return linkedGLStrings; + } + + private static LinkageDisequilibriumGenotypeList inflateGenotypeList(String id, String glString, String note) { + LinkageDisequilibriumGenotypeList linkedGLString; + + String submittedGlString = glString; + + if (!GLStringUtilities.validateGLStringFormat(glString)) { + glString = GLStringUtilities.fullyQualifyGLString(glString); + } + + MultilocusUnphasedGenotype mug = GLStringUtilities.convertToMug(glString); + linkedGLString = new LinkageDisequilibriumGenotypeList(id, mug); + + linkedGLString.setSubmittedGlString(submittedGlString); + linkedGLString.setNote(note); - return glStrings; + return linkedGLString; } public static MultilocusUnphasedGenotype convertToMug(String glString) { diff --git a/ld-validation/src/main/java/org/dash/valid/gl/LinkageDisequilibriumGenotypeList.java b/ld-validation/src/main/java/org/dash/valid/gl/LinkageDisequilibriumGenotypeList.java index 011c0bc..01ce9a9 100644 --- a/ld-validation/src/main/java/org/dash/valid/gl/LinkageDisequilibriumGenotypeList.java +++ b/ld-validation/src/main/java/org/dash/valid/gl/LinkageDisequilibriumGenotypeList.java @@ -46,6 +46,7 @@ public class LinkageDisequilibriumGenotypeList { private String id; private String glString; + private String note; private String submittedGlString; private MultilocusUnphasedGenotype mug; @@ -105,6 +106,14 @@ public LinkageDisequilibriumGenotypeList(String id, MultilocusUnphasedGenotype m } } + public String getNote() { + return note; + } + + public void setNote(String note) { + this.note = note; + } + public String getSubmittedGlString() { return submittedGlString; } diff --git a/ld-validation/src/main/resources/schema/LinkageFindings.xsd b/ld-validation/src/main/resources/schema/LinkageFindings.xsd index 46d83ca..cab0b32 100644 --- a/ld-validation/src/main/resources/schema/LinkageFindings.xsd +++ b/ld-validation/src/main/resources/schema/LinkageFindings.xsd @@ -15,6 +15,7 @@ + diff --git a/ld-validation/src/test/java/org/dash/LinkageDisequilibriumAnalyzerTest.java b/ld-validation/src/test/java/org/dash/LinkageDisequilibriumAnalyzerTest.java index d69ebf6..ef6f8a0 100644 --- a/ld-validation/src/test/java/org/dash/LinkageDisequilibriumAnalyzerTest.java +++ b/ld-validation/src/test/java/org/dash/LinkageDisequilibriumAnalyzerTest.java @@ -22,7 +22,6 @@ package org.dash; import java.io.IOException; -import java.util.LinkedHashMap; import java.util.List; import org.dash.valid.HLALinkageDisequilibrium; @@ -49,10 +48,10 @@ public void testLinkageReportingExamples() { @Test public void testLinkageReportingMugs() throws IOException { - LinkedHashMap glStrings = GLStringUtilities.readGLStringFile("fullyQualifiedExample.txt"); + List glStrings = GLStringUtilities.readGLStringFile("fullyQualifiedExample.txt"); - for (String key : glStrings.keySet()) { - MultilocusUnphasedGenotype mug = GLStringUtilities.convertToMug(glStrings.get(key)); + for (LinkageDisequilibriumGenotypeList linkedGLString : glStrings) { + MultilocusUnphasedGenotype mug = GLStringUtilities.convertToMug(linkedGLString.getGLString()); assertNotNull(mug); diff --git a/ld-validation/src/test/java/org/dash/gl/GLStringTest.java b/ld-validation/src/test/java/org/dash/gl/GLStringTest.java index bc19681..fbd333e 100644 --- a/ld-validation/src/test/java/org/dash/gl/GLStringTest.java +++ b/ld-validation/src/test/java/org/dash/gl/GLStringTest.java @@ -25,8 +25,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.LinkedHashMap; -import java.util.Set; +import java.util.List; import org.dash.valid.Locus; import org.dash.valid.gl.GLStringUtilities; @@ -47,20 +46,13 @@ public class GLStringTest extends TestCase { @Before public void setUp() throws IOException { - LinkedHashMap validGLStrings = GLStringUtilities.readGLStringFile("fullyQualifiedExample.txt"); + List validGLStrings = GLStringUtilities.readGLStringFile("fullyQualifiedExample.txt"); + + glString = validGLStrings.get(0); - Set keys = validGLStrings.keySet(); + List strictGLStrings = GLStringUtilities.readGLStringFile("strictExample.txt"); - for (String key : keys) { - glString = new LinkageDisequilibriumGenotypeList(key, GLStringUtilities.fullyQualifyGLString(validGLStrings.get(key))); - } - - LinkedHashMap strictGLStrings = GLStringUtilities.readGLStringFile("strictExample.txt"); - - keys = strictGLStrings.keySet(); - String key = keys.iterator().next(); - - STRICT_GL_STRING = strictGLStrings.get(key); + STRICT_GL_STRING = strictGLStrings.get(0).getGLString(); } @Test diff --git a/ld-validation/src/test/java/org/dash/gl/GLStringUtilitiesTest.java b/ld-validation/src/test/java/org/dash/gl/GLStringUtilitiesTest.java index 92f035e..a3ebd3f 100644 --- a/ld-validation/src/test/java/org/dash/gl/GLStringUtilitiesTest.java +++ b/ld-validation/src/test/java/org/dash/gl/GLStringUtilitiesTest.java @@ -22,7 +22,6 @@ package org.dash.gl; import java.io.IOException; -import java.util.LinkedHashMap; import java.util.List; import java.util.Set; @@ -30,6 +29,7 @@ import org.dash.valid.freq.HLAFrequenciesLoader; import org.dash.valid.gl.GLStringConstants; import org.dash.valid.gl.GLStringUtilities; +import org.dash.valid.gl.LinkageDisequilibriumGenotypeList; import org.junit.Test; import org.nmdp.gl.MultilocusUnphasedGenotype; @@ -56,6 +56,7 @@ public class GLStringUtilitiesTest extends TestCase { private static final String VALID_GL_STRING_MAC = "HLA-A*01:01/HLA-A*01:02+HLA-A*26:01^HLA-C*01:01/HLA-C*01:03+HLA-C*04:01"; private static final String TAB_DELIMITED = "TAB_DELIMITED"; private static final String COMMA_DELIMITED = "COMMA_DELIMITED"; + private static final String MY_NOTE = "My Note"; @Test public void testParse() { @@ -79,25 +80,26 @@ public void testHasFrequency() throws IOException { @Test public void testTabDelimitedGLStringFile() { - LinkedHashMap glStrings = GLStringUtilities.readGLStringFile("tabDelimitedExample.txt"); - for (String key : glStrings.keySet()) { - assertTrue(TAB_DELIMITED.equals(key)); + List glStrings = GLStringUtilities.readGLStringFile("tabDelimitedExample.txt"); + for (LinkageDisequilibriumGenotypeList linkedGLString : glStrings) { + assertTrue(TAB_DELIMITED.equals(linkedGLString.getId())); + assertTrue(MY_NOTE.equals(linkedGLString.getNote())); } } @Test public void testCommaDelimitedGLStringFile() { - LinkedHashMap glStrings = GLStringUtilities.readGLStringFile("commaDelimitedExample.txt"); - for (String key : glStrings.keySet()) { - assertTrue(COMMA_DELIMITED.equals(key)); + List glStrings = GLStringUtilities.readGLStringFile("commaDelimitedExample.txt"); + for (LinkageDisequilibriumGenotypeList linkedGLString : glStrings) { + assertTrue(COMMA_DELIMITED.equals(linkedGLString.getId())); } } @Test public void testHMLFile() { - LinkedHashMap glStrings = GLStringUtilities.readGLStringFile("hml_1_0_2-example7-ngsFull.xml"); - for (String key : glStrings.keySet()) { - assertTrue("1367-7150-8".equals(key)); + List glStrings = GLStringUtilities.readGLStringFile("hml_1_0_2-example7-ngsFull.xml"); + for (LinkageDisequilibriumGenotypeList linkedGLString : glStrings) { + assertTrue("1367-7150-8".equals(linkedGLString.getId())); } } diff --git a/ld-validation/src/test/resources/tabDelimitedExample.txt b/ld-validation/src/test/resources/tabDelimitedExample.txt index ae84985..51c0fbd 100644 --- a/ld-validation/src/test/resources/tabDelimitedExample.txt +++ b/ld-validation/src/test/resources/tabDelimitedExample.txt @@ -1 +1 @@ -TAB_DELIMITED HLA-A*01:01:01:01+HLA-A*26:01:01^HLA-B*38:01:01/HLA-B*38:27+HLA-B*44:03:01/HLA-B*44:03:10/HLA-B*44:125^HLA-C*04:01:01:01/HLA-C*04:01:01:02/HLA-C*04:01:01:03/HLA-C*04:01:01:04/HLA-C*04:01:01:05/HLA-C*04:20/HLA-C*04:117+HLA-C*12:03:01:01/HLA-C*12:03:01:02/HLA-C*12:34^HLA-DPA1*01:03:01:01/HLA-DPA1*01:03:01:02/HLA-DPA1*01:03:01:03/HLA-DPA1*01:03:01:04/HLA-DPA1*01:03:01:05+HLA-DPA1*01:03:01:01/HLA-DPA1*01:03:01:02/HLA-DPA1*01:03:01:03/HLA-DPA1*01:03:01:04/HLA-DPA1*01:03:01:05^HLA-DPB1*04:01:01:01/HLA-DPB1*04:01:01:02+HLA-DPB1*04:01:01:01/HLA-DPB1*04:01:01:02^HLA-DQA1*02:01+HLA-DQA1*05:05:01:01/HLA-DQA1*05:05:01:02/HLA-DQA1*05:05:01:03/HLA-DQA1*05:09/HLA-DQA1*05:11^HLA-DQB1*02:02+HLA-DQB1*03:01:01:01/HLA-DQB1*03:01:01:02/HLA-DQB1*03:01:01:03^HLA-DRB1*07:01:01:01/HLA-DRB1*07:01:01:02+HLA-DRB1*11:01:01^HLA-DRB3*02:02:01:01/HLA-DRB3*02:02:01:02^HLA-DRB4*01:01:01:01/HLA-DRB4*03:01N \ No newline at end of file +TAB_DELIMITED HLA-A*01:01:01:01+HLA-A*26:01:01^HLA-B*38:01:01/HLA-B*38:27+HLA-B*44:03:01/HLA-B*44:03:10/HLA-B*44:125^HLA-C*04:01:01:01/HLA-C*04:01:01:02/HLA-C*04:01:01:03/HLA-C*04:01:01:04/HLA-C*04:01:01:05/HLA-C*04:20/HLA-C*04:117+HLA-C*12:03:01:01/HLA-C*12:03:01:02/HLA-C*12:34^HLA-DPA1*01:03:01:01/HLA-DPA1*01:03:01:02/HLA-DPA1*01:03:01:03/HLA-DPA1*01:03:01:04/HLA-DPA1*01:03:01:05+HLA-DPA1*01:03:01:01/HLA-DPA1*01:03:01:02/HLA-DPA1*01:03:01:03/HLA-DPA1*01:03:01:04/HLA-DPA1*01:03:01:05^HLA-DPB1*04:01:01:01/HLA-DPB1*04:01:01:02+HLA-DPB1*04:01:01:01/HLA-DPB1*04:01:01:02^HLA-DQA1*02:01+HLA-DQA1*05:05:01:01/HLA-DQA1*05:05:01:02/HLA-DQA1*05:05:01:03/HLA-DQA1*05:09/HLA-DQA1*05:11^HLA-DQB1*02:02+HLA-DQB1*03:01:01:01/HLA-DQB1*03:01:01:02/HLA-DQB1*03:01:01:03^HLA-DRB1*07:01:01:01/HLA-DRB1*07:01:01:02+HLA-DRB1*11:01:01^HLA-DRB3*02:02:01:01/HLA-DRB3*02:02:01:02^HLA-DRB4*01:01:01:01/HLA-DRB4*03:01N My Note \ No newline at end of file