Skip to content

Commit

Permalink
Merge branch 'release/v2.2.8'
Browse files Browse the repository at this point in the history
  • Loading branch information
gmarcais committed Feb 8, 2018
2 parents 2cff4e4 + 870426c commit 68c9cb2
Show file tree
Hide file tree
Showing 26 changed files with 266 additions and 100 deletions.
4 changes: 3 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ AM_SH_LOG_FLAGS =
TESTS = tests/generate_sequence.sh tests/parallel_hashing.sh \
tests/merge.sh tests/bloom_filter.sh tests/big.sh \
tests/subset_hashing.sh tests/multi_file.sh \
tests/bloom_counter.sh tests/large_key.sh tests/sam.sh
tests/bloom_counter.sh tests/large_key.sh tests/sam.sh \
tests/small_mers.sh

EXTRA_DIST += $(TESTS)
clean-local: clean-local-check
Expand All @@ -164,6 +165,7 @@ tests/min_qual.log: tests/generate_fastq_sequence.log
tests/large_key.log: tests/generate_sequence.log
tests/quality_filter.log: tests/generate_sequence.log
tests/sam.log: tests/generate_sequence.log
tests/small_mers.log: tests/generate_sequence.log

# SWIG tests
TESTS += tests/swig_python.sh tests/swig_ruby.sh tests/swig_perl.sh
Expand Down
19 changes: 13 additions & 6 deletions configure.ac
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
AC_INIT([jellyfish], [2.2.7], [gmarcais@umd.edu])
AC_INIT([jellyfish], [2.2.8], [gmarcais@umd.edu])
AC_CANONICAL_HOST
AC_CONFIG_MACRO_DIR([m4])
AM_INIT_AUTOMAKE([subdir-objects foreign parallel-tests color-tests])
Expand All @@ -10,18 +10,23 @@ AC_LIB_RPATH
PKG_PROG_PKG_CONFIG

# Change default compilation flags
AC_SUBST([ALL_CXXFLAGS], [-std=c++0x])
CXXFLAGS="-std=c++0x $CXXFLAGS"
AC_LANG(C++)
AC_PROG_CXX

# Major version of the library
AC_SUBST([PACKAGE_LIB], [2.0])

# Check if gnu++11 is necessary
save_CXXFLAGS=$CXXFLAGS
AC_CANONICAL_HOST
case "${host_os}" in
cygwin*) CXXFLAGS="-std=gnu++11 $save_CXXFLAGS" ;;
*) CXXFLAGS="-std=c++11 $save_CXXFLAGS" ;;
esac

# Try to find htslib to read SAM/BAM/CRAM files
AC_ARG_ENABLE([htslib],
[AS_HELP_STRING([--enable-htslib], [Look for the HTS library (default=yes)])])
echo "enable_htslib $enable_htslib"
AS_IF([test "x$enable_htslib" = "xyes" -o "x$enable_htslib" = "x"],
[PKG_CHECK_MODULES([HTSLIB], [htslib], [AC_DEFINE([HAVE_HTSLIB], [1], [Defined if htslib is available])], [true])]
[AC_LIB_LINKFLAGS_FROM_LIBS([HTSLIB_RPATH], [$HTSLIB_LIBS], [LIBTOOL])])
Expand Down Expand Up @@ -88,8 +93,7 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <mach-o/dyld.h>]],
[AC_DEFINE([HAVE_NSGETEXECUTABLEPATH], [1], [Used to find executable path on MacOS X])],
[AC_MSG_RESULT([no])])

# Check the version of strerror_r
AC_CHECK_HEADERS_ONCE([execinfo.h ext/stdio_filebuf.h])
AC_CHECK_HEADERS_ONCE([execinfo.h ext/stdio_filebuf.h sys/syscall.h])
AC_CHECK_MEMBER([siginfo_t.si_int],
[AC_DEFINE([HAVE_SI_INT], [1], [Define if siginfo_t.si_int exists])],
[], [[#include <signal.h>]])
Expand Down Expand Up @@ -134,6 +138,9 @@ AM_CONDITIONAL(PYTHON_BINDING, [test -n "$enable_python_binding" -a x$enable_pyt
AM_COND_IF([PYTHON_BINDING],
[AS_IF([test x$enable_python_binding != xyes], [PYTHON_SITE_PKG=$enable_python_binding])]
[AX_PYTHON_DEVEL([], [$prefix])])
AC_ARG_ENABLE([python-deprecated],
[AC_HELP_STRING([--enable-python-deprecated], [enable the deprecated 'jellyfish' module (in addition to 'dna_jellyfish')])])
AM_CONDITIONAL([PYTHON_DEPRECATED], [test -z "$enable_python_deprecated" -o x$enable_python_deprecated != xno])

# Ruby binding setup
AS_IF([test -z "$enable_ruby_binding"], [enable_ruby_binding="$enable_all_binding"])
Expand Down
14 changes: 11 additions & 3 deletions include/jellyfish/file_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class file_header : public generic_file_header {
name += std::to_string((long long int)i); // Cast to make gcc4.4 happy!
const unsigned int r = root_[name]["r"].asUInt();
const unsigned int c = root_[name]["c"].asUInt();
if(root_[name]["identity"].asBool())
return RectangularBinaryMatrix::identity(r, c);

std::vector<uint64_t> raw(c, (uint64_t)0);
for(unsigned int i = 0; i < c; ++i)
raw[i] = root_[name]["columns"][i].asUInt64();
Expand All @@ -57,9 +60,14 @@ class file_header : public generic_file_header {
root_[name].clear();
root_[name]["r"] = m.r();
root_[name]["c"] = m.c();
for(unsigned int i = 0; i < m.c(); ++i) {
Json::UInt64 x = m[i];
root_[name]["columns"].append(x);
if(m.is_low_identity()) {
root_[name]["identity"] = true;
} else {
root_[name]["identity"] = false;
for(unsigned int i = 0; i < m.c(); ++i) {
Json::UInt64 x = m[i];
root_[name]["columns"].append(x);
}
}
}

Expand Down
23 changes: 13 additions & 10 deletions include/jellyfish/hash_counter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,16 @@ class hash_counter {

while(!ary_->add(k, v, &carry_shift, is_new_ptr, id_ptr)) {
handle_full_ary();
v &= ~(uint64_t)0 << carry_shift;
v &= ~(uint64_t)0 << carry_shift;
// If carry_shift == 0, failed to allocate the first field for
// key, hence status of is_new and value for id are not
// determined yet. On the other hand, if carry_shift > 0, we
// failed while adding extra field for large key, so the status
// of is_new and value of id are known. We do not update them in future
// calls.
if(carry_shift) {
is_new_ptr = &is_new_void;
id_ptr = &id_void;
is_new_ptr = &is_new_void;
id_ptr = &id_void;
}
}
}
Expand Down Expand Up @@ -204,9 +204,16 @@ class hash_counter {
bool double_size(bool serial_thread) {
if(serial_thread) {// Allocate new array for size doubling
try {
new_ary_ = new array(ary_->size() * 2, ary_->key_len(), ary_->val_len(),
ary_->max_reprobe(), ary_->reprobes());
} catch(typename array::ErrorAllocation e) {
if(ary_->key_len() >= sizeof(size_t) * 8 || ary_->size() < ((size_t)1 << ary_->key_len())) {
// Increase number of keys
new_ary_ = new array(ary_->size() * 2, ary_->key_len(), ary_->val_len(),
ary_->max_reprobe(), ary_->reprobes());
} else {
// Array is already maximum compared to key len, increase val_len
new_ary_ = new array(ary_->size(), ary_->key_len(), ary_->val_len() + 1,
ary_->max_reprobe(), ary_->reprobes());
}
} catch(typename array::ErrorAllocation e) {
new_ary_ = 0;
}
}
Expand All @@ -219,10 +226,6 @@ class hash_counter {

// Copy data from old to new
uint16_t id = atomic_t::fetch_add(&size_thid_, (uint16_t)1);
// Why doesn't the following work? Seems like a bug to
// me. Equivalent call works in test_large_hash_array. Or am I
// missing something?
// eager_iterator it = ary_->iterator_slice<eager_iterator>(id, nb_threads_);
eager_iterator it = ary_->eager_slice(id, nb_threads_);
while(it.next())
my_ary->add(it.key(), it.val());
Expand Down
67 changes: 54 additions & 13 deletions include/jellyfish/large_hash_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -930,23 +930,35 @@ class array_base {

};

template<typename Key, typename word = uint64_t, typename atomic_t = ::atomic::gcc, typename mem_block_t = ::allocators::mmap>
class array :
// Large array. Memory managed by the mmap allocator. Does not check
// the relation between the size of the array and key_len.
template<typename Key, typename word = uint64_t,
typename atomic_t = ::atomic::gcc, typename mem_block_t = ::allocators::mmap>
class unbounded_array :
protected mem_block_t,
public array_base<Key, word, atomic_t, array<Key, word, atomic_t, mem_block_t> >
public array_base<Key, word, atomic_t, unbounded_array<Key, word, atomic_t, mem_block_t> >
{
typedef array_base<Key, word, atomic_t, array<Key, word, atomic_t, mem_block_t> > super;
friend class array_base<Key, word, atomic_t, array<Key, word, atomic_t, mem_block_t> >;
typedef array_base<Key, word, atomic_t, unbounded_array<Key, word, atomic_t, mem_block_t> > super;
friend class array_base<Key, word, atomic_t, unbounded_array<Key, word, atomic_t, mem_block_t> >;

public:
array(size_t size, // Size of hash. To be rounded up to a power of 2
uint16_t key_len, // Size of key in bits
uint16_t val_len, // Size of val in bits
uint16_t reprobe_limit, // Maximum reprobe
const size_t* reprobes = quadratic_reprobes) : // Reprobing policy
mem_block_t(),
super(size, key_len, val_len, reprobe_limit, RectangularBinaryMatrix(ceilLog2(size), key_len).randomize_pseudo_inverse(),
reprobes)
unbounded_array(size_t size, // Size of hash. To be rounded up to a power of 2
uint16_t key_len, // Size of key in bits
uint16_t val_len, // Size of val in bits
uint16_t reprobe_limit, // Maximum reprobe
const size_t* reprobes = quadratic_reprobes) // Reprobing policy
: super(size, key_len, val_len, reprobe_limit,
RectangularBinaryMatrix(ceilLog2(size), key_len).randomize_pseudo_inverse(),
reprobes)
{ }

// Construct the array with a caller-supplied hashing matrix instead
// of the pseudo-random one built by the other constructor. All
// arguments are forwarded unchanged to the array_base constructor.
//
// NOTE(review): `m` is a named rvalue reference, hence an lvalue in
// the initializer below. Whether this results in a copy of the matrix
// depends on the array_base constructor signature, which is not
// visible here -- confirm, and consider std::move(m) if it takes the
// matrix by value.
unbounded_array(size_t size, // Size of hash. To be rounded up to a power of 2
uint16_t key_len, // Size of key in bits
uint16_t val_len, // Size of val in bits
uint16_t reprobe_limit, // Maximum reprobe
RectangularBinaryMatrix&& m, // Hashing matrix
const size_t* reprobes = quadratic_reprobes) // Reprobing policy
: super(size, key_len, val_len, reprobe_limit, m, reprobes)
{ }

protected:
Expand All @@ -956,6 +968,35 @@ class array :
}
};

// Large array whose memory is managed by the mmap allocator. Unlike
// unbounded_array, the requested size is capped according to key_len:
// no more than key_len_size(key_len) entries are requested from the
// base class. When the cap applies, the identity matrix is used as the
// hashing matrix; otherwise a randomized pseudo-inverse is generated.
template<typename Key, typename word = uint64_t,
         typename atomic_t = ::atomic::gcc, typename mem_block_t = ::allocators::mmap>
class array : public unbounded_array<Key, word, atomic_t, mem_block_t>
{
  typedef unbounded_array<Key, word, atomic_t, mem_block_t> super;

  // Upper bound on the array size for keys of key_len bits:
  // 2^key_len when that value fits in a size_t, otherwise half of the
  // largest size_t.
  static size_t key_len_size(uint16_t key_len) {
    if(key_len >= std::numeric_limits<size_t>::digits)
      return std::numeric_limits<size_t>::max() / 2;
    return (size_t)1 << key_len;
  }

public:
  array(size_t size, // Size of hash. To be rounded up to a power of 2
        uint16_t key_len, // Size of key in bits
        uint16_t val_len, // Size of val in bits
        uint16_t reprobe_limit, // Maximum reprobe
        const size_t* reprobes = quadratic_reprobes) // Reprobing policy
    : super(std::min(size, key_len_size(key_len)),
            key_len,
            val_len,
            reprobe_limit,
            // Requested size reaches the bound for this key length:
            // hash with the identity. Otherwise draw a random matrix.
            (size >= key_len_size(key_len))
            ? RectangularBinaryMatrix::identity(key_len)
            : RectangularBinaryMatrix(ceilLog2(size), key_len).randomize_pseudo_inverse(),
            reprobes)
  { }
};

struct ptr_info {
void* ptr_;
size_t bytes_;
Expand Down
2 changes: 1 addition & 1 deletion include/jellyfish/mer_overlap_sequence_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ class mer_overlap_sequence_parser : public jellyfish::cooperative_pool2<mer_over
// streams_iterator_ noticed that we closed that stream before
// requesting a new one.
st.stream.reset();
st.stream = streams_iterator_.next();
st.stream = std::move(streams_iterator_.next());
if(!st.stream.good()) {
st.type = DONE_TYPE;
return false;
Expand Down
43 changes: 35 additions & 8 deletions include/jellyfish/rectangular_binary_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,33 @@
// bits of each word are set to 0).
//
// Multiplication between a matrix and vector of size _c x 1 gives a
// vector of size _r x 1 stored as one 64 bit word.
// vector of size _r x 1 stored as one 64 bit word. A matrix with a
// NULL _columns pointer behaves like the identity.

namespace jellyfish {
class RectangularBinaryMatrix {
// Private constructor for a square c x c matrix without column
// storage: _columns is set to NULL and both dimensions are set to c.
// As stated in the header comment of this file, a matrix with a NULL
// _columns pointer behaves like the identity.
explicit RectangularBinaryMatrix(unsigned int c)
: _columns(NULL)
, _r(c)
, _c(c)
{ }

public:
RectangularBinaryMatrix(unsigned int r, unsigned c)
: _columns(alloc(r, c)), _r(r), _c(c) { }
RectangularBinaryMatrix(const RectangularBinaryMatrix &rhs)
: _columns(alloc(rhs._r, rhs._c)), _r(rhs._r), _c(rhs._c) {
memcpy(_columns, rhs._columns, sizeof(uint64_t) * _c);
: _columns(rhs._columns ? alloc(rhs._r, rhs._c) : NULL)
, _r(rhs._r)
, _c(rhs._c)
{
if(_columns)
memcpy(_columns, rhs._columns, sizeof(uint64_t) * _c);
}
RectangularBinaryMatrix(RectangularBinaryMatrix&& rhs) :
_columns(rhs._columns), _r(rhs._r), _c(rhs._c) {
RectangularBinaryMatrix(RectangularBinaryMatrix&& rhs)
: _columns(rhs._columns)
, _r(rhs._r)
, _c(rhs._c)
{
rhs._columns = 0;
}
// Initialize from raw data. raw must contain at least c words.
Expand All @@ -67,6 +81,16 @@ namespace jellyfish {
free(_columns);
}

// Return the square c x c identity in its storage-free form: the
// result is built with the private single-argument constructor, which
// leaves _columns NULL.
static RectangularBinaryMatrix identity(unsigned c) {
return RectangularBinaryMatrix(c);
}

// Return an r x c matrix initialized by init_low_identity(), called
// with its default argument. Columns are allocated by the (r, c)
// constructor first.
// NOTE(review): init_low_identity() is defined outside this view;
// whether it keeps or releases the column storage (e.g. when r == c)
// should be confirmed against its implementation.
static RectangularBinaryMatrix identity(unsigned r, unsigned c) {
RectangularBinaryMatrix res(r, c);
res.init_low_identity();
return res;
}

RectangularBinaryMatrix &operator=(const RectangularBinaryMatrix &rhs) {
if(_r != rhs._r || _c != rhs._c)
throw std::invalid_argument("RHS matrix dimensions do not match");
Expand All @@ -90,7 +114,7 @@ namespace jellyfish {
}

// Get i-th column. No check on range
const uint64_t & operator[](unsigned int i) const { return _columns[i]; }
uint64_t operator[](unsigned int i) const { return _columns ? _columns[i] : ((uint64_t)1 << i); }

unsigned int r() const { return _r; }
unsigned int c() const { return _c; }
Expand All @@ -112,8 +136,8 @@ namespace jellyfish {

// Make the matrix, or check that the matrix is, the lower right
// corner of the identity.
void init_low_identity();
bool is_low_identity();
void init_low_identity(bool simplify = true);
bool is_low_identity() const;

// Left matrix vector multiplication. Type T supports the operator
// v[i] to return the i-th 64 bit word of v.
Expand Down Expand Up @@ -204,6 +228,7 @@ namespace jellyfish {

template<typename T>
uint64_t RectangularBinaryMatrix::times_loop(const T &v) const {
if(!_columns) return v[0] & cmask();
uint64_t *p = _columns + _c - 1;
uint64_t res = 0, x = 0, j = 0;
const uint64_t one = (uint64_t)1;
Expand Down Expand Up @@ -244,6 +269,7 @@ namespace jellyfish {
#ifdef HAVE_SSE
template<typename T>
uint64_t RectangularBinaryMatrix::times_sse(const T &v) const {
if(!_columns) return v[0] & cmask();
#define FFs ((uint64_t)-1)
static const uint64_t smear[8] asm("smear") __attribute__ ((aligned(16),used)) =
{0, 0, 0, FFs, FFs, 0, FFs, FFs};
Expand Down Expand Up @@ -338,6 +364,7 @@ namespace jellyfish {
#ifdef HAVE_INT128
template<typename T>
uint64_t RectangularBinaryMatrix::times_128(const T &v) const {
if(!_columns) return v[0] & cmask();
typedef unsigned __int128 u128;
static const u128 smear[4] =
{ (u128)0,
Expand Down
2 changes: 1 addition & 1 deletion include/jellyfish/whole_sequence_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class whole_sequence_parser : public jellyfish::cooperative_pool2<whole_sequence
protected:
void open_next_file(stream_status& st) {
st.stream.reset();
st.stream = streams_iterator_.next();
st.stream = std::move(streams_iterator_.next());
if(!st.stream.good()) {
st.type = DONE_TYPE;
return;
Expand Down
9 changes: 8 additions & 1 deletion jellyfish/dbg.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@

#include <jellyfish/dbg.hpp>
#include <jellyfish/time.hpp>

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef HAVE_SYS_SYSCALL_H
#include <sys/syscall.h>
#endif

namespace dbg {
pthread_mutex_t print_t::_lock = PTHREAD_MUTEX_INITIALIZER;
Expand All @@ -33,7 +40,7 @@ namespace dbg {
}
Time toc() {
#ifdef DEBUG
Time t;
Time t;
return t - _tic_time;
#else
return Time::zero;
Expand Down
Loading

0 comments on commit 68c9cb2

Please sign in to comment.