
Commit

code works
Xu, Qinying (H&B, Herston) authored and Xu, Qinying (H&B, Herston) committed Apr 4, 2024
1 parent 7b6df96 commit da88fa9
Showing 2 changed files with 7 additions and 8 deletions.
11 changes: 5 additions & 6 deletions src/main/java/au/csiro/variantspark/utils/FileUtils.java
Original file line number Diff line number Diff line change
@@ -12,23 +12,22 @@ public class FileUtils {
* @param file: an input file
* @return true if the input file is BGZIP, determined by checking the first two bytes of the input file
*/
- public static boolean isInputBGZ(final File file) {
-
+ public static boolean isBGZFile(String filePath) {
/**
- * .vcf.bgz is a type of GZIP file
+ * .vcf.bgz is a type of GZIP file and works well with BlockCompressedInputStream
* .vcf.gz is also a GZIP file but gets java.lang.OutOfMemoryError at java.io.InputStreamReader.read(InputStreamReader.java:184)
* .vcf.bz2 is not a GZIP file and gets java.lang.OutOfMemoryError at java.io.InputStreamReader.read(InputStreamReader.java:184)
* .vcf is not a GZIP file and gets htsjdk.samtools.SAMFormatException: at header from java.io.BufferedReader.readLine(BufferedReader.java:389)
- */
- try (BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(file))) {
+ */
+ try (BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(filePath))) {
bufferedInputStream.mark(100); // mark the current position
boolean isValid = BlockCompressedInputStream.isValidFile(bufferedInputStream);
bufferedInputStream.reset(); // reset back to the marked position
return isValid;
} catch (IOException e) {
// Handle the exception
return false;
}
}
}

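For reference, the renamed helper takes a plain path string rather than a java.io.File, so it can be called with the same string that is later handed to Spark. Below is a minimal Scala sketch of how the check might be exercised; the file names are hypothetical examples matching the extensions discussed in the comment above, not files from the repository:

    import au.csiro.variantspark.utils.FileUtils

    object BgzCheckExample {
      def main(args: Array[String]): Unit = {
        // Hypothetical local files; per the comment above, only the .vcf.bgz
        // variant is block-compressed and should return true.
        val candidates = Seq("sample.vcf.bgz", "sample.vcf.gz", "sample.vcf.bz2", "sample.vcf")
        candidates.foreach { path =>
          println(s"$path -> isBGZFile = ${FileUtils.isBGZFile(path)}")
        }
      }
    }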
4 changes: 2 additions & 2 deletions src/main/scala/au/csiro/variantspark/cli/args/SparkArgs.scala
@@ -15,9 +15,9 @@ trait SparkArgs extends SparkApp {
val sparkPar: Int = 0

def textFile(inputFile: String): RDD[String] = {
- val isBGZ = FileUtils.isInputBGZ(new File(inputFile))
+ val isBGZ = FileUtils.isBGZFile(inputFile)
println(inputFile + " is loading to spark RDD, isBGZFile: " + isBGZ)
- if (isBGZ ) {
+ if (isBGZ) {
val path = new Path(inputFile)
val fs = path.getFileSystem(sc.hadoopConfiguration)
val bgzInputStream = new BlockCompressedInputStream(fs.open(path))
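The remainder of textFile is not shown in this hunk. As a rough illustration of the technique the BGZ branch relies on, here is one way a BlockCompressedInputStream can be turned into an RDD[String]; this is a sketch under the assumption that the whole file fits on the driver, and not necessarily the code hidden below the fold:

    import htsjdk.samtools.util.BlockCompressedInputStream
    import org.apache.hadoop.fs.Path
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import scala.io.Source

    // Sketch only: decompress on the driver and parallelise the lines.
    // Fine for small inputs; a large VCF would need a streaming approach.
    def bgzTextFile(sc: SparkContext, inputFile: String): RDD[String] = {
      val path = new Path(inputFile)
      val fs = path.getFileSystem(sc.hadoopConfiguration)
      val bgzInputStream = new BlockCompressedInputStream(fs.open(path))
      val lines = Source.fromInputStream(bgzInputStream).getLines().toList
      sc.parallelize(lines)
    }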
