Skip to content

Commit

Permalink
Merge pull request #26 from monarch-initiative/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
pnrobinson authored Jun 7, 2024
2 parents 50e31d5 + ce6633d commit 8c513ae
Show file tree
Hide file tree
Showing 46 changed files with 3,401 additions and 215 deletions.
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ nav:
- "Template": 'languages.md'
- "English": "english.md"
- Setup: "setup.md"
- Batch: "batch.md"

plugins:
- search
Expand Down
6 changes: 3 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.monarchinitiative</groupId>
<artifactId>phenopacket2prompt</artifactId>
<version>0.3.14</version>
<version>0.4.0</version>

<name>phenopacket2prompt</name>
<url>https://github.com/monarch-initiative/phenopacket2prompt</url>
Expand Down Expand Up @@ -186,8 +186,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
<source>21</source>
<target>21</target>
<!-- <release>1.8</release> -->
</configuration>
</plugin>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease;
import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual;
import org.monarchinitiative.phenopacket2prompt.output.CorrectResult;
import org.monarchinitiative.phenopacket2prompt.output.PpktCopy;
import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -23,6 +24,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;

@CommandLine.Command(name = "batch", aliases = {"B"},
Expand All @@ -40,9 +42,16 @@ public class GbtTranslateBatchCommand implements Callable<Integer> {
description = "path to translations file")
private String translationsPath = "data/hp-international.obo";

@CommandLine.Option(names = {"-o", "--outdir"},
description = "path to outdir")
private String outdirname = "prompts";

@CommandLine.Option(names = {"-d", "--dir"}, description = "Path to directory with JSON phenopacket files", required = true)
private String ppktDir;

private String currentLanguageCode = null;
private int currentCount;

@Override
public Integer call() throws Exception {
File hpJsonFile = new File(hpoJsonPath);
Expand All @@ -57,19 +66,51 @@ public Integer call() throws Exception {
return 1;
}
HpInternationalOboParser oboParser = new HpInternationalOboParser(translationsFile);

Map<String, HpInternational> internationalMap = oboParser.getLanguageToInternationalMap();
LOGGER.info("Got {} translations", internationalMap.size());
List<File> ppktFiles = getAllPhenopacketJsonFiles();
createDir("prompts");
createDir(outdirname);
List<CorrectResult> correctResultList = outputPromptsEnglish(ppktFiles, hpo);
// output all non-English languages here
PromptGenerator spanish = PromptGenerator.spanish(hpo, internationalMap.get("es"));

// SPANISH
PromptGenerator spanish = PromptGenerator.spanish(internationalMap.get("es"));
resetOutput("es");
outputPromptsInternational(ppktFiles, hpo, "es", spanish);

resetOutput("nl");
PromptGenerator dutch = PromptGenerator.dutch(internationalMap.get("nl"));
outputPromptsInternational(ppktFiles, hpo, "nl", dutch);
// GERMAN
resetOutput("de");
PromptGenerator german = PromptGenerator.german(internationalMap.get("de"));
outputPromptsInternational(ppktFiles, hpo, "de", german);

// ITALIAN
resetOutput("it");
PromptGenerator italian = PromptGenerator.italian(internationalMap.get("it"));
outputPromptsInternational(ppktFiles, hpo, "it", italian);
resetOutput("finished");
// output original phenopackets
PpktCopy pcopy = new PpktCopy(new File(outdirname));
for (var file : ppktFiles) {
pcopy.copyFile(file);
}

// output file with correct diagnosis list
outputCorrectResults(correctResultList);
return 0;
}

/**
 * Switches progress tracking to a new output language.
 * <p>
 * If a language is already being tracked, a summary line with the number of
 * phenopackets counted for it is printed first. The given code then becomes
 * the current language and the counter is reset to zero.
 *
 * @param languageCode code of the language whose output starts now; this class
 *                     also passes the sentinel {@code "finished"} after the last
 *                     language so that its summary line is printed
 */
private void resetOutput(String languageCode) {
    // Nothing to summarize before the first language has been started.
    if (currentLanguageCode != null) {
        System.out.printf("Finished writing %d phenopackets in %s\n", currentCount, currentLanguageCode);
    }
    currentLanguageCode = languageCode;
    currentCount = 0;
}

private void outputCorrectResults(List<CorrectResult> correctResultList) {
File outfile = new File("prompts" + File.separator + "correct_results.tsv");
try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile))) {
Expand All @@ -79,12 +120,12 @@ private void outputCorrectResults(List<CorrectResult> correctResultList) {
} catch (IOException e) {
e.printStackTrace();
}
System.out.printf("[INFO] Output a total of %d prompts in en and es.\n", correctResultList.size());
System.out.printf("[INFO] Output a total of %d prompts in en, es, nl, de, and it.\n", correctResultList.size());
}


private String getFileName(String phenopacketID) {
return phenopacketID.replaceAll("[^\\w]", phenopacketID).replaceAll("/","_") + "-prompt.txt";
private String getFileName(String phenopacketID, String languageCode) {
return phenopacketID.replaceAll("[^\\w]","_") + "_" + languageCode + "-prompt.txt";
}


Expand All @@ -94,21 +135,28 @@ private void outputPromptsInternational(List<File> ppktFiles, Ontology hpo, Stri
createDir(dirpath);
List<String> diagnosisList = new ArrayList<>();
for (var f: ppktFiles) {
PpktIndividual individual = new PpktIndividual(f);
PpktIndividual individual = PpktIndividual.fromFile(f);
List<PhenopacketDisease> diseaseList = individual.getDiseases();
if (diseaseList.size() != 1) {
System.err.println(String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()));
continue;
String errmsg = String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId());
throw new PhenolRuntimeException(errmsg);
}
PhenopacketDisease pdisease = diseaseList.get(0);
String promptFileName = getFileName( individual.getPhenopacketId());
String promptFileName = getFileName( individual.getPhenopacketId(), languageCode);
String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath());
try {
diagnosisList.add(diagnosisLine);
String prompt = generator.createPrompt(individual);
outputPrompt(prompt, promptFileName, dirpath);
} catch (Exception e) {
e.printStackTrace();
System.err.printf("[ERROR] Could not process %s: %s\n", promptFileName, e.getMessage());
//e.printStackTrace();
}
}
Set<String> missing = generator.getMissingTranslations();
if (! missing.isEmpty()) {
for (var m : missing) {
System.out.printf("[%s] Missing: %s\n", languageCode, m);
}
}
}
Expand All @@ -117,17 +165,17 @@ private void outputPromptsInternational(List<File> ppktFiles, Ontology hpo, Stri
private List<CorrectResult> outputPromptsEnglish(List<File> ppktFiles, Ontology hpo) {
createDir("prompts/en");
List<CorrectResult> correctResultList = new ArrayList<>();
PromptGenerator generator = PromptGenerator.english(hpo);
PromptGenerator generator = PromptGenerator.english();

for (var f: ppktFiles) {
PpktIndividual individual = new PpktIndividual(f);
PpktIndividual individual = PpktIndividual.fromFile(f);
List<PhenopacketDisease> diseaseList = individual.getDiseases();
if (diseaseList.size() != 1) {
System.err.println(String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()));
System.err.printf("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId());
continue;
}
PhenopacketDisease pdisease = diseaseList.get(0);
String promptFileName = getFileName( individual.getPhenopacketId());
String promptFileName = getFileName( individual.getPhenopacketId(), "en");
String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath());
try {
String prompt = generator.createPrompt(individual);
Expand All @@ -150,7 +198,8 @@ private void outputPrompt(String prompt, String promptFileName, String dir) {
} catch (IOException e) {
e.printStackTrace();
}
System.out.print(".");
System.out.printf("%s %d.\r", currentLanguageCode, currentCount);
currentCount++;
}


Expand All @@ -177,6 +226,9 @@ private List<File> getAllPhenopacketJsonFiles() {
for (File item : items) {
if (item.isDirectory())
ppktDirectories.add(ppktDir+item.getName());
else if (item.isFile() && item.getName().endsWith(".json")) {
ppktFiles.add(item);
}
}
for (var f: ppktDirectories) {
File subdir = new File(f);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ public class GptTranslateCommand implements Callable<Integer> {
@CommandLine.Option(names = {"-p", "--ppkt"}, description = "Path to JSON phenopacket file", required = true)
private String ppkt;

@CommandLine.Option(names = {"-l", "--language"}, description = "Language code", defaultValue = "de")
private String languageCode;


@Override
public Integer call() throws Exception {
Expand All @@ -54,13 +57,31 @@ public Integer call() throws Exception {


System.out.println(hpo.version().orElse("n/a"));
PromptGenerator generator = PromptGenerator.english(hpo);
PpktIndividual individual = new PpktIndividual(new File(ppkt));
PromptGenerator generator = PromptGenerator.english();
PpktIndividual individual = PpktIndividual.fromFile(new File(ppkt));
String prompt = generator.createPrompt(individual);
System.out.println(prompt);
System.out.println("SPANISH");
PromptGenerator spanish = PromptGenerator.spanish(hpo, internationalMap.get("es"));
prompt = spanish.createPrompt(individual);
switch (languageCode) {
case "de" -> {
PromptGenerator german = PromptGenerator.german(internationalMap.get("de"));
prompt = german.createPrompt(individual);
}
case "es" -> {
PromptGenerator spanish = PromptGenerator.spanish(internationalMap.get("es"));
prompt = spanish.createPrompt(individual);
}
case "nl" -> {
PromptGenerator dutch = PromptGenerator.dutch(internationalMap.get("nl"));
prompt = dutch.createPrompt(individual);
}
case "it" -> {
PromptGenerator italian = PromptGenerator.italian(internationalMap.get("it"));
prompt = italian.createPrompt(individual);
}
default -> prompt = "did not recognize language code " + languageCode;
}


System.out.println(prompt);

return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class HpInternationalOboParser {
* @return in this case "tr"
*/
public Optional<String> getLanguage(String annots) {
final String translation = "translation:language=\"(\\w{2,2})\"";
final String translation = "translation:language=\"(\\w{2,3})\"";
final Pattern pattern = Pattern.compile(translation);
Matcher matcher = pattern.matcher(annots);
if (matcher.find()) {
Expand All @@ -53,7 +53,7 @@ public Optional<String> getTranslation(String annots) {
public HpInternationalOboParser(File file) {
languageToInternationalMap = new HashMap<>();
String pattern = "id: (HP:\\d{7,7})";
Set<String> acronyms = Set.of("cs", "en", "es", "fr", "ja", "nl", "nna", "tr", "tw", "zh");
Set<String> acronyms = Set.of("cs", "en", "de", "dtp", "it", "es", "fr", "ja", "nl", "nna", "tr", "tw", "zh");
for (String acronym : acronyms) {
languageToInternationalMap.put(acronym, new HpInternational(acronym));
}
Expand Down Expand Up @@ -84,9 +84,13 @@ public HpInternationalOboParser(File file) {
Optional<String> opt = getLanguage(annots);
if (opt.isPresent()) {
String language = opt.get();
if (! languageToInternationalMap.containsKey(language)) {
System.err.println("[ERROR] Could not find language \"" + language + "\"");
continue;
}
languageToInternationalMap.get(language).addTerm(currentHpoTermId, hpoLabel);
} else {
System.err.printf("[ERROR] Could not extract language for %s.", line);
System.err.printf("[ERROR] Could not extract language for %s.\n", line);
}
}

Expand All @@ -98,13 +102,13 @@ public HpInternationalOboParser(File file) {
} catch (IOException e) {
e.printStackTrace();
}
for (String language : languageToInternationalMap.keySet()) {
/*for (String language : languageToInternationalMap.keySet()) {
System.out.println(language);
HpInternational international = languageToInternationalMap.get(language);
for (var entry : international.getTermIdToLabelMap().entrySet()) {
System.out.printf("\t%s: %s\n", entry.getKey().getValue(), entry.getValue());
}
}
}*/
}

public Map<String, HpInternational> getLanguageToInternationalMap() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.monarchinitiative.phenopacket2prompt.model;

public class AgeNotSpecified implements PhenopacketAge {
import java.util.Objects;

public final class AgeNotSpecified implements PhenopacketAge {
@Override
public String age() {
return "";
Expand Down Expand Up @@ -43,4 +45,18 @@ public int totalDays() {

@Override
public boolean specified() {return false; }

/**
 * Hash code derived solely from {@code totalDays()}, the same value that
 * {@code equals} compares.
 */
@Override
public int hashCode() {
    // Integer.hashCode(int) yields the int itself, identical to hashing the boxed value.
    return Integer.hashCode(totalDays());
}

/**
 * Compares by day count: returns true only when {@code obj} is a
 * {@code PhenopacketAge} whose {@code totalDays()} equals this instance's.
 */
@Override
public boolean equals(Object obj) {
    return obj instanceof PhenopacketAge other
            && other.totalDays() == totalDays();
}


}
Loading

0 comments on commit 8c513ae

Please sign in to comment.