diff --git a/.travis.yml b/.travis.yml index 6301ef9..a2e2e6f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,6 @@ language: python # python versioning python: - 2.7 - - 3.4 - - 3.5 - - 3.6 # requirements install: diff --git a/cmapPy/pandasGEXpress/tests/test_write_gctx.py b/cmapPy/pandasGEXpress/tests/test_write_gctx.py index ead687c..fad9454 100644 --- a/cmapPy/pandasGEXpress/tests/test_write_gctx.py +++ b/cmapPy/pandasGEXpress/tests/test_write_gctx.py @@ -79,6 +79,37 @@ def test_write_version(self): self.assertEqual(hdf5_v2, write_gctx.version_number) os.remove(fn) + def test_calculate_elem_per_kb(self): + max_chunk_kb = 1024 + + # dtype is numpy.float32 + dtype1 = numpy.float32 + correct_elem_per_kb1 = 256 + elem_per_kb1 = write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype1) + self.assertEqual(elem_per_kb1, correct_elem_per_kb1) + + # dtype is numpy.float64 + dtype2 = numpy.float64 + correct_elem_per_kb2 = 128 + elem_per_kb2 = write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype2) + self.assertEqual(elem_per_kb2, correct_elem_per_kb2) + + # dtype is something else + dtype3 = numpy.int + with self.assertRaises(Exception) as context: + write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype3) + self.assertTrue("only numpy.float32 and numpy.float64 are currently supported" in str(context.exception)) + + + def test_set_data_matrix_chunk_size(self): + max_chunk_kb = 1024 + elem_per_kb = 256 + sample_data_shape = (978, 1000) + expected_chunk_size = (978, 268) + calculated_chunk_size = write_gctx.set_data_matrix_chunk_size(sample_data_shape, max_chunk_kb, elem_per_kb) + self.assertEqual(calculated_chunk_size, expected_chunk_size) + + def test_write_metadata(self): """ CASE 1: """ mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False) hdf5_writer = h5py.File(FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx", "w") - write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False) - 
write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False) + write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False, 6) + write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False, 6) hdf5_writer.close() logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format( os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx"))) @@ -142,8 +173,8 @@ def test_write_metadata(self): # write row and col metadata fields from mini_gctoo_for_testing instance to file # Note this time does convert back to -666 hdf5_writer = h5py.File(FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx", "w") - write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True) - write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True) + write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True, 6) + write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True, 6) hdf5_writer.close() # read in written metadata, then close and delete file diff --git a/cmapPy/pandasGEXpress/write_gctx.py b/cmapPy/pandasGEXpress/write_gctx.py index 98c31cc..2e9064c 100644 --- a/cmapPy/pandasGEXpress/write_gctx.py +++ b/cmapPy/pandasGEXpress/write_gctx.py @@ -18,7 +18,8 @@ version_number = "GCTX1.0" -def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True): +def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6, + max_chunk_kb=1024): """ Essentially the same as write() method; enables user to call write_gctx() from cmapPy instead of write_gctx.write() @@ -28,13 +29,18 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True): write(gctoo_object, out_file_name, convert_back_to_neg_666) -def write(gctoo_object, out_file_name, convert_back_to_neg_666=True): +def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6, + max_chunk_kb=1024, matrix_dtype=numpy.float32): """ Writes a 
GCToo instance to specified file. Input: - gctoo_object (GCToo): A GCToo instance. - out_file_name (str): file name to write gctoo_object to. + - convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666" + - gzip_compression_level (int, default=6): Compression level to use for metadata. + - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy + - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix. """ # make sure out file has a .gctx suffix gctx_out_name = add_gctx_to_out_name(out_file_name) @@ -48,14 +54,21 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True): # write src write_src(hdf5_out, gctoo_object, gctx_out_name) + # set chunk size for data matrix + elem_per_kb = calculate_elem_per_kb(max_chunk_kb, matrix_dtype) + chunk_size = set_data_matrix_chunk_size(gctoo_object.data_df.shape, max_chunk_kb, elem_per_kb) + # write data matrix - hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix()) + hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix(), + dtype=matrix_dtype) # write col metadata - write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666) + write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666, + gzip_compression=gzip_compression_level) # write row metadata - write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666) + write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666, + gzip_compression=gzip_compression_level) # close gctx file hdf5_out.close() @@ -101,8 +114,46 @@ def write_version(hdf5_out): """ hdf5_out.attrs[version_attr] = numpy.string_(version_number) +def calculate_elem_per_kb(max_chunk_kb, matrix_dtype): + """ + Calculates the number of elem per kb depending on the max chunk size set. 
+ + Input: + - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy + - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix. + Currently needs to be np.float32 or np.float64 (TODO: figure out a better way to get bits from a numpy dtype). + + Returns: + elem_per_kb (int), the number of elements per kb for matrix dtype specified. + """ + if matrix_dtype == numpy.float32: + return (max_chunk_kb * 8)/32 + elif matrix_dtype == numpy.float64: + return (max_chunk_kb * 8)/64 + else: + msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported".format(matrix_dtype) + logger.error(msg) + raise Exception("write_gctx.calculate_elem_per_kb " + msg) -def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666): + +def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb): + """ + Sets chunk size to use for writing data matrix. + Note. Calculation used here is for compatibility with cmapM and cmapR. + + Input: + - df_shape (tuple): shape of input data_df. + - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy + - elem_per_kb (int): Number of elements per kb + + Returns: + chunk size (tuple) to use for chunking the data matrix + """ + row_chunk_size = min(df_shape[0], 1000) + col_chunk_size = min(((max_chunk_kb*elem_per_kb)//row_chunk_size), df_shape[1]) + return (row_chunk_size, col_chunk_size) + +def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666, gzip_compression): """ Writes either column or row metadata to proper node of gctx out (hdf5) file. 
@@ -123,7 +174,8 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666): logger.error("'dim' argument must be either 'row' or 'col'!") # write id field to expected node - hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index]) + hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index], + compression=gzip_compression) metadata_fields = list(metadata_df.columns.copy()) @@ -135,4 +187,5 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666): # write metadata columns to their own arrays for field in [entry for entry in metadata_fields if entry != "ind"]: hdf5_out.create_dataset(metadata_node_name + "/" + field, - data=numpy.array(list(metadata_df.loc[:, field]))) + data=numpy.array(list(metadata_df.loc[:, field])), + compression=gzip_compression) diff --git a/setup.cfg b/setup.cfg index bb32faf..7019a02 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,3 @@ [bdist_wheel] -# Only Python 2.7 supported; some versions of Python 3 support as well -universal=1 \ No newline at end of file +# Only Python 2.7 supported +universal=0 diff --git a/setup.py b/setup.py index e9f7f42..03cc9a9 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='2.2.0', + version='3.0.0', description='Assorted tools for interacting with .gct, .gctx files and other Connectivity Map (Broad Institute) data/tools', long_description="cmapPy: Tools for interacting with .gctx and .gct files, and other Connectivity Map resources. See our documentation at http://cmappy.readthedocs.io/en/latest/, and for more information on the file formats and available resources, please see clue.io/gctx.", @@ -45,11 +45,7 @@ # Specify the Python versions you support here. 
In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 2.7' ], # What does your project relate to?