Merge pull request #27 from cmap/chunking_and_compression
Chunking and compression
oena authored Jan 11, 2018
2 parents fef095f + 8f2f4a4 commit d3a3690
Showing 5 changed files with 100 additions and 23 deletions.
3 changes: 0 additions & 3 deletions .travis.yml
@@ -3,9 +3,6 @@ language: python
# python versioning
python:
- 2.7
- 3.4
- 3.5
- 3.6

# requirements
install:
39 changes: 35 additions & 4 deletions cmapPy/pandasGEXpress/tests/test_write_gctx.py
@@ -79,6 +79,37 @@ def test_write_version(self):
self.assertEqual(hdf5_v2, write_gctx.version_number)
os.remove(fn)

def test_calculate_elem_per_kb(self):
max_chunk_kb = 1024

# dtype is numpy.float32
dtype1 = numpy.float32
correct_elem_per_kb1 = 256
elem_per_kb1 = write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype1)
self.assertEqual(elem_per_kb1, correct_elem_per_kb1)

# dtype is numpy.float64
dtype2 = numpy.float64
correct_elem_per_kb2 = 128
elem_per_kb2 = write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype2)
self.assertEqual(elem_per_kb2, correct_elem_per_kb2)

# dtype is something else
dtype3 = numpy.int
with self.assertRaises(Exception) as context:
write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype3)
self.assertTrue("only numpy.float32 and numpy.float64 are currently supported" in str(context.exception))


def test_set_data_matrix_chunk_size(self):
max_chunk_kb = 1024
elem_per_kb = 256
sample_data_shape = (978, 1000)
expected_chunk_size = (978, 268)
calculated_chunk_size = write_gctx.set_data_matrix_chunk_size(sample_data_shape, max_chunk_kb, elem_per_kb)
self.assertEqual(calculated_chunk_size, expected_chunk_size)


def test_write_metadata(self):
"""
CASE 1:
@@ -87,8 +118,8 @@ def test_write_metadata(self):
"""
mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)
hdf5_writer = h5py.File(FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx", "w")
write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False)
write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False)
write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False, 6)
write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False, 6)
hdf5_writer.close()
logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(
os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx")))
@@ -142,8 +173,8 @@ def test_write_metadata(self):
# write row and col metadata fields from mini_gctoo_for_testing instance to file
# Note this time does convert back to -666
hdf5_writer = h5py.File(FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx", "w")
write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True)
write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True)
write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True, 6)
write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True, 6)
hdf5_writer.close()

# read in written metadata, then close and delete file
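For reference, the expected values asserted in the two new tests can be reproduced by hand. The following is a minimal standalone sketch (not part of the commit) that walks through the same arithmetic as the helpers added to write_gctx.py below:

max_chunk_kb = 1024

# With max_chunk_kb = 1024, the budget is treated as 1024 * 8 bits:
# 32-bit float32 elements give 256 per KB, 64-bit float64 give 128.
elem_per_kb_f32 = (max_chunk_kb * 8) // 32   # 256
elem_per_kb_f64 = (max_chunk_kb * 8) // 64   # 128

# Chunking a 978 x 1000 float32 matrix: cap rows per chunk at 1000,
# then fit as many columns as the 1024 KB chunk budget allows.
df_shape = (978, 1000)
row_chunk = min(df_shape[0], 1000)                             # 978
col_chunk = min((max_chunk_kb * elem_per_kb_f32) // row_chunk,
                df_shape[1])                                   # 268

assert (row_chunk, col_chunk) == (978, 268)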
69 changes: 61 additions & 8 deletions cmapPy/pandasGEXpress/write_gctx.py
@@ -18,7 +18,8 @@
version_number = "GCTX1.0"


def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True):
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
max_chunk_kb=1024):
"""
Essentially the same as the write() method; enables the user to call write_gctx() from
cmapPy instead of write_gctx.write()
@@ -28,13 +29,18 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True):
write(gctoo_object, out_file_name, convert_back_to_neg_666, gzip_compression_level, max_chunk_kb)


def write(gctoo_object, out_file_name, convert_back_to_neg_666=True):
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
max_chunk_kb=1024, matrix_dtype=numpy.float32):
"""
Writes a GCToo instance to specified file.
Input:
- gctoo_object (GCToo): A GCToo instance.
- out_file_name (str): file name to write gctoo_object to.
- convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666"
- gzip_compression_level (int, default=6): Compression level to use for metadata.
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
- matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
"""
# make sure out file has a .gctx suffix
gctx_out_name = add_gctx_to_out_name(out_file_name)
@@ -48,14 +54,21 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True):
# write src
write_src(hdf5_out, gctoo_object, gctx_out_name)

# set chunk size for data matrix
elem_per_kb = calculate_elem_per_kb(max_chunk_kb, matrix_dtype)
chunk_size = set_data_matrix_chunk_size(gctoo_object.data_df.shape, max_chunk_kb, elem_per_kb)

# write data matrix
hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix())
hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix(),
dtype=matrix_dtype, chunks=chunk_size)

# write col metadata
write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666)
write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666,
gzip_compression=gzip_compression_level)

# write row metadata
write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666)
write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666,
gzip_compression=gzip_compression_level)

# close gctx file
hdf5_out.close()
@@ -101,8 +114,46 @@ def write_version(hdf5_out):
"""
hdf5_out.attrs[version_attr] = numpy.string_(version_number)

def calculate_elem_per_kb(max_chunk_kb, matrix_dtype):
"""
Calculates the number of elements per KB, depending on the max chunk size set.
Input:
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
- matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
Currently needs to be np.float32 or np.float64 (TODO: figure out a better way to get bits from a numpy dtype).
Returns:
elem_per_kb (int): the number of elements per KB for the specified matrix dtype.
"""
if matrix_dtype == numpy.float32:
return (max_chunk_kb * 8)/32
elif matrix_dtype == numpy.float64:
return (max_chunk_kb * 8)/64
else:
msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported".format(matrix_dtype)
logger.error(msg)
raise Exception("write_gctx.calculate_elem_per_kb " + msg)

def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):

def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb):
"""
Sets chunk size to use for writing data matrix.
Note: the calculation used here is for compatibility with cmapM and cmapR.
Input:
- df_shape (tuple): shape of input data_df.
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
- elem_per_kb (int): Number of elements per kb
Returns:
chunk size (tuple) to use for chunking the data matrix
"""
row_chunk_size = min(df_shape[0], 1000)
col_chunk_size = min(((max_chunk_kb*elem_per_kb)//row_chunk_size), df_shape[1])
return (row_chunk_size, col_chunk_size)

def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666, gzip_compression):
"""
Writes either column or row metadata to proper node of gctx out (hdf5) file.
@@ -123,7 +174,8 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
logger.error("'dim' argument must be either 'row' or 'col'!")

# write id field to expected node
hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index])
hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index],
compression=gzip_compression)

metadata_fields = list(metadata_df.columns.copy())

@@ -135,4 +187,5 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
# write metadata columns to their own arrays
for field in [entry for entry in metadata_fields if entry != "ind"]:
hdf5_out.create_dataset(metadata_node_name + "/" + field,
data=numpy.array(list(metadata_df.loc[:, field])))
data=numpy.array(list(metadata_df.loc[:, field])),
compression=gzip_compression)
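With these additions, callers can tune compression and chunking at write time. A minimal usage sketch, assuming an existing GCToo instance named my_gctoo (the variable name and output file names are illustrative):

import numpy
from cmapPy.pandasGEXpress import write_gctx

# Defaults: gzip level 6 for metadata, 1024 KB max chunks, float32 storage.
write_gctx.write(my_gctoo, "my_data.gctx")

# Heavier metadata compression and smaller chunks, trading write speed
# for file size; note that h5py accepts an integer compression level,
# which implies the gzip filter.
write_gctx.write(my_gctoo, "my_data_small.gctx",
                 gzip_compression_level=9,
                 max_chunk_kb=256,
                 matrix_dtype=numpy.float64)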
4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,3 +1,3 @@
[bdist_wheel]
# Only Python 2.7 supported; some versions of Python 3 support as well
universal=1
# Only Python 2.7 supported
universal=0
8 changes: 2 additions & 6 deletions setup.py
@@ -12,7 +12,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='2.2.0',
version='3.0.0',

description='Assorted tools for interacting with .gct, .gctx files and other Connectivity Map (Broad Institute) data/tools',
long_description="cmapPy: Tools for interacting with .gctx and .gct files, and other Connectivity Map resources. See our documentation at http://cmappy.readthedocs.io/en/latest/, and for more information on the file formats and available resources, please see clue.io/gctx.",
@@ -45,11 +45,7 @@
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 2.7'
],

# What does your project relate to?
