-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'TASK-5564' into TASK-5387
- Loading branch information
Showing 7 changed files with 261 additions and 1 deletion.
There are no files selected for viewing
118 changes: 118 additions & 0 deletions
118
biodata-formats/src/main/java/org/opencb/biodata/formats/feature/mirbase/MirBaseParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package org.opencb.biodata.formats.feature.mirbase; | ||
|
||
import org.opencb.biodata.models.core.MiRnaGene; | ||
import org.opencb.biodata.models.core.MiRnaMature; | ||
import org.opencb.commons.utils.FileUtils; | ||
|
||
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
|
||
public class MirBaseParser { | ||
|
||
private static final String ID_LABEL = "ID"; | ||
private static final String AC_LABEL = "AC"; | ||
private static final String DE_LABEL = "DE"; | ||
private static final String FT_LABEL = "FT"; | ||
private static final String SQ_LABEL = "SQ"; | ||
private static final String END_OF_ITEM_LABEL = "XX"; | ||
private static final String END_OF_RECORD_LABEL = "//"; | ||
|
||
private static final String MIRNA_LABEL = "miRNA"; | ||
|
||
private MirBaseParser() { | ||
throw new IllegalStateException("Utility class"); | ||
} | ||
|
||
public static void parse(Path miRnaDatFile, String species, MirBaseParserCallback callback) throws IOException { | ||
try (BufferedReader datReader = new BufferedReader(new InputStreamReader(FileUtils.newInputStream(miRnaDatFile)))) { | ||
String miRBaseAccession = null; | ||
String miRBaseID = null; | ||
MiRnaGene miRnaGene = null; | ||
String line; | ||
while ((line = datReader.readLine()) != null) { | ||
String[] split = line.split("\\s+"); | ||
switch (split[0]) { | ||
case ID_LABEL: { | ||
miRBaseID = split[1]; | ||
break; | ||
} | ||
case AC_LABEL: { | ||
miRBaseAccession = split[1].split(";")[0]; | ||
break; | ||
} | ||
case DE_LABEL: { | ||
if (line.contains(species)) { | ||
miRnaGene = new MiRnaGene(); | ||
miRnaGene.setId(miRBaseID) | ||
.setAccession(miRBaseAccession); | ||
} | ||
break; | ||
} | ||
case FT_LABEL: { | ||
if (miRnaGene != null && MIRNA_LABEL.equalsIgnoreCase(split[1])) { | ||
processMiRnaMature(line, miRnaGene, datReader); | ||
} | ||
break; | ||
} | ||
case SQ_LABEL: { | ||
if (miRnaGene != null) { | ||
StringBuilder seq = new StringBuilder(); | ||
// Read until END_OF_RECORD_LABEL | ||
while (!(line = datReader.readLine()).equals(END_OF_RECORD_LABEL)) { | ||
split = line.split("\\s+"); | ||
for (int i = 1; i < split.length - 1; i++) { | ||
seq.append(split[i]); | ||
} | ||
} | ||
miRnaGene.setSequence(seq.toString()); | ||
|
||
// Update mature sequences | ||
for (MiRnaMature mature : miRnaGene.getMatures()) { | ||
if (mature.getStart() > 0 && mature.getEnd() > 0) { | ||
mature.setSequence(miRnaGene.getSequence().substring(mature.getStart() - 1, mature.getEnd())); | ||
} | ||
} | ||
|
||
// Callback | ||
callback.processMiRnaGene(miRnaGene); | ||
miRnaGene = null; | ||
} | ||
break; | ||
} | ||
default: { | ||
// Do nothing | ||
break; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
private static void processMiRnaMature(String headerLine, MiRnaGene miRnaGene, BufferedReader datReader) throws IOException { | ||
// Create MiRNA mature from header line, | ||
// e.g: FT miRNA 6..27 | ||
MiRnaMature miRnaMature = new MiRnaMature(); | ||
String[] split = headerLine.split("\\s+"); | ||
String[] pos = split[2].split("\\.\\."); | ||
miRnaMature.setStart(Integer.parseInt(pos[0])); | ||
miRnaMature.setEnd(Integer.parseInt(pos[1])); | ||
|
||
String line; | ||
while (!(line = datReader.readLine()).equals(END_OF_ITEM_LABEL)) { | ||
split = line.split("\\s+"); | ||
if (split[0].equalsIgnoreCase(FT_LABEL) && split[1].equalsIgnoreCase(MIRNA_LABEL)) { | ||
processMiRnaMature(line, miRnaGene, datReader); | ||
break; | ||
} else { | ||
if (line.contains("accession=")) { | ||
miRnaMature.setAccession(line.split("accession=")[1].replace("\"", "")); | ||
} else if (line.contains("product=")) { | ||
miRnaMature.setId(line.split("product=")[1].replace("\"", "")); | ||
} | ||
} | ||
} | ||
miRnaGene.getMatures().add(miRnaMature); | ||
} | ||
} |
7 changes: 7 additions & 0 deletions
7
...rmats/src/main/java/org/opencb/biodata/formats/feature/mirbase/MirBaseParserCallback.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package org.opencb.biodata.formats.feature.mirbase; | ||
|
||
import org.opencb.biodata.models.core.MiRnaGene; | ||
|
||
public interface MirBaseParserCallback { | ||
boolean processMiRnaGene(MiRnaGene miRnaGene); | ||
} |
105 changes: 105 additions & 0 deletions
105
...a-formats/src/test/java/org/opencb/biodata/formats/feature/mirbase/MirBaseParserTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package org.opencb.biodata.formats.feature.mirbase; | ||
|
||
import org.junit.Assert; | ||
import org.junit.Test; | ||
import org.opencb.biodata.models.core.MiRnaGene; | ||
import org.opencb.biodata.models.core.MiRnaMature; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class MirBaseParserTest { | ||
|
||
|
||
|
||
// Implementation of the MirBaseParserCallback function | ||
public class MyCallback implements MirBaseParserCallback { | ||
private String msg; | ||
private List<MiRnaGene> miRnaGenes; | ||
|
||
public MyCallback(String msg) { | ||
this.msg = msg; | ||
this.miRnaGenes = new ArrayList<>(); | ||
} | ||
|
||
@Override | ||
public boolean processMiRnaGene(MiRnaGene miRnaGene) { | ||
System.out.println(msg); | ||
System.out.println(miRnaGene.toString()); | ||
miRnaGenes.add(miRnaGene); | ||
return true; | ||
} | ||
|
||
public List<MiRnaGene> getMiRnaGenes() { | ||
return miRnaGenes; | ||
} | ||
|
||
public MiRnaGene getMiRnaGene(String accession) { | ||
for (MiRnaGene miRnaGene : miRnaGenes) { | ||
if (accession.equals(miRnaGene.getAccession())) { | ||
return miRnaGene; | ||
} | ||
} | ||
return null; | ||
} | ||
|
||
public int getCounter() { | ||
return miRnaGenes.size(); | ||
} | ||
} | ||
|
||
@Test | ||
public void testMirBaseParser() throws IOException { | ||
Path datFile = Paths.get(getClass().getResource("/miRNA.small.dat.gz").getPath()); | ||
|
||
MyCallback callback = new MyCallback(">>> Testing message"); | ||
|
||
MirBaseParser.parse(datFile, "Homo sapiens", callback); | ||
Assert.assertEquals(50, callback.getCounter()); | ||
|
||
MiRnaGene mi0000060 = callback.getMiRnaGene("MI0000060"); | ||
Assert.assertEquals("hsa-let-7a-1", mi0000060.getId()); | ||
Assert.assertEquals("ugggaUGAGGUAGUAGGUUGUAUAGUUuuagggucacacccaccacugggagauaaCUAUACAAUCUACUGUCUUUCcua".toUpperCase(), mi0000060.getSequence().toUpperCase()); | ||
int found = 0; | ||
for (MiRnaMature mature : mi0000060.getMatures()) { | ||
if ("MIMAT0000062".equals(mature.getAccession())) { | ||
found++; | ||
Assert.assertEquals("hsa-let-7a-5p", mature.getId()); | ||
Assert.assertEquals("UGAGGUAGUAGGUUGUAUAGUU".toUpperCase(), mature.getSequence().toUpperCase()); | ||
Assert.assertEquals(6, mature.getStart()); | ||
Assert.assertEquals(27, mature.getEnd()); | ||
} else if ("MIMAT0004481".equals(mature.getAccession())) { | ||
found++; | ||
Assert.assertEquals("hsa-let-7a-3p", mature.getId()); | ||
Assert.assertEquals("CUAUACAAUCUACUGUCUUUC".toUpperCase(), mature.getSequence().toUpperCase()); | ||
Assert.assertEquals(57, mature.getStart()); | ||
Assert.assertEquals(77, mature.getEnd()); | ||
} | ||
} | ||
Assert.assertEquals(2, found); | ||
|
||
MiRnaGene mi0000077 = callback.getMiRnaGene("MI0000077"); | ||
Assert.assertEquals("hsa-mir-21", mi0000077.getId()); | ||
Assert.assertEquals("ugucgggUAGCUUAUCAGACUGAUGUUGAcuguugaaucucauggCAACACCAGUCGAUGGGCUGUcugaca".toUpperCase(), mi0000077.getSequence().toUpperCase()); | ||
found = 0; | ||
for (MiRnaMature mature : mi0000077.getMatures()) { | ||
if ("MIMAT0000076".equals(mature.getAccession())) { | ||
found++; | ||
Assert.assertEquals("hsa-miR-21-5p", mature.getId()); | ||
Assert.assertEquals("UAGCUUAUCAGACUGAUGUUGA".toUpperCase(), mature.getSequence().toUpperCase()); | ||
Assert.assertEquals(8, mature.getStart()); | ||
Assert.assertEquals(29, mature.getEnd()); | ||
} else if ("MIMAT0004494".equals(mature.getAccession())) { | ||
found++; | ||
Assert.assertEquals("hsa-miR-21-3p", mature.getId()); | ||
Assert.assertEquals("CAACACCAGUCGAUGGGCUGU".toUpperCase(), mature.getSequence().toUpperCase()); | ||
Assert.assertEquals(46, mature.getStart()); | ||
Assert.assertEquals(66, mature.getEnd()); | ||
} | ||
} | ||
Assert.assertEquals(2, found); | ||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters