diff --git a/src/main/java/au/csiro/variantspark/utils/FileUtils.java b/src/main/java/au/csiro/variantspark/utils/FileUtils.java
index e90e68e7..20ca1e48 100644
--- a/src/main/java/au/csiro/variantspark/utils/FileUtils.java
+++ b/src/main/java/au/csiro/variantspark/utils/FileUtils.java
@@ -12,15 +12,14 @@ public class FileUtils {
    * @param file: an input file
    * @return true if input file is BGZIP by check the first two byte of input file
    */
-  public static boolean isInputBGZ(final File file) {
-
+  public static boolean isBGZFile(String filePath) {
     /**
-     * .vcf.bgz is type of GZP file
+     * .vcf.bgz is type of GZP file, work well with BlockCompressedInputStream
      * .vcf.gz is also GZP file but get java.lang.OutOfMemoryError at java.io.InputStreamReader.read(InputStreamReader.java:184)
      * .vcf.bz2 is not GZP file and get java.lang.OutOfMemoryError at java.io.InputStreamReader.read(InputStreamReader.java:184)
      * .vcf is not GZP file and get htsjdk.samtools.SAMFormatException: at header from java.io.BufferedReader.readLine(BufferedReader.java:389)
-     */
-    try (BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(file))) {
+     */
+    try (BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(filePath))) {
       bufferedInputStream.mark(100); // mark the current position
       boolean isValid = BlockCompressedInputStream.isValidFile(bufferedInputStream);
       bufferedInputStream.reset(); // reset back to the marked position
@@ -28,7 +27,7 @@ public static boolean isInputBGZ(final File file) {
     } catch (IOException e) {
       // Handle the exception
       return false;
-    }
+    }
   }
 
   /**
diff --git a/src/main/scala/au/csiro/variantspark/cli/args/SparkArgs.scala b/src/main/scala/au/csiro/variantspark/cli/args/SparkArgs.scala
index ade9f4f5..6e72a20a 100644
--- a/src/main/scala/au/csiro/variantspark/cli/args/SparkArgs.scala
+++ b/src/main/scala/au/csiro/variantspark/cli/args/SparkArgs.scala
@@ -15,9 +15,9 @@ trait SparkArgs extends SparkApp {
   val sparkPar: Int = 0
 
   def textFile(inputFile: String): RDD[String] = {
-    val isBGZ = FileUtils.isInputBGZ(new File(inputFile))
+    val isBGZ = FileUtils.isBGZFile(inputFile)
     println(inputFile + " is loading to spark RDD, isBGZFile: " + isBGZ)
-    if (isBGZ) {
+    if (isBGZ) {
       val path = new Path(inputFile)
       val fs = path.getFileSystem(sc.hadoopConfiguration)
       val bgzInputStream = new BlockCompressedInputStream(fs.open(path))
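
Note (not part of the patch): a minimal sketch of how the renamed FileUtils.isBGZFile(String) helper could be exercised locally, assuming htsjdk is on the classpath; the class name IsBGZFileDemo and the sample file paths are hypothetical.

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.InputStreamReader;

    import htsjdk.samtools.util.BlockCompressedInputStream;
    import au.csiro.variantspark.utils.FileUtils;

    public class IsBGZFileDemo {
        public static void main(String[] args) throws Exception {
            // Per the comment in the patch, only block-gzipped files (.vcf.bgz)
            // are expected to pass the BGZF check.
            System.out.println(FileUtils.isBGZFile("data/chr22.vcf.bgz")); // expected: true
            System.out.println(FileUtils.isBGZFile("data/chr22.vcf.gz"));  // expected: false
            System.out.println(FileUtils.isBGZFile("data/chr22.vcf"));     // expected: false

            // When the check passes, the file can be streamed line by line
            // through htsjdk's BlockCompressedInputStream, as SparkArgs.textFile does.
            if (FileUtils.isBGZFile("data/chr22.vcf.bgz")) {
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                        new BlockCompressedInputStream(new File("data/chr22.vcf.bgz"))))) {
                    System.out.println(reader.readLine()); // first VCF header line
                }
            }
        }
    }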