diff --git a/README.md b/README.md
index 7b385564..91d5d2cf 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,11 @@
 ##Purpose
 
-The MongoDB Connector for Hadoop is a library which allows MongoDB (or backup files in its data format, BSON) to be used as an input source, or output destination, for Hadoop MapReduce tasks. It is designed to allow greater flexibility and performance and make it easy to integrate data in MongoDB with other parts of the Hadoop ecosystem. 
+The MongoDB Connector for Hadoop is a library that allows MongoDB (or backup files in its data format, BSON) to be used as an input source or output destination for Hadoop MapReduce tasks. It is designed to provide greater flexibility and performance, and to make it easy to integrate data in MongoDB with other parts of the Hadoop ecosystem.
 
 Current stable release: **1.1**
 
-Current unstable release: **1.2.0-rc0**
+
+Current unstable release: **1.2.0-rc1**
 
 ## Features
@@ -19,7 +20,7 @@ Current unstable release: **1.2.0-rc0**
 ## Download
 
 * 0.20.x
-    
+
   * [core](https://s3.amazonaws.com/drivers.mongodb.org/hadoop/mongo-hadoop-core_0.20.205.0-1.1.0.jar)
   * [pig support](https://s3.amazonaws.com/drivers.mongodb.org/hadoop/mongo-hadoop-pig_0.20.205.0-1.1.0.jar)
   * [hive support](https://s3.amazonaws.com/drivers.mongodb.org/hadoop/mongo-hadoop-hive_0.20.205.0-1.1.0.jar)
@@ -84,7 +85,7 @@ After successfully building, you must copy the jars to the lib directory on each
   Does **not** support Hadoop Streaming.
 
   Build using `"1.0"` or `"1.0.x"`
-    
+
 * ###Apache Hadoop 1.1
   Includes support for Hadoop Streaming.
@@ -93,38 +94,38 @@ After successfully building, you must copy the jars to the lib directory on each
 * ###Apache Hadoop 0.20.*
   Does **not** support Hadoop Streaming
-    
+
   Includes Pig 0.9.2.
-    
+
   Build using `"0.20"` or `"0.20.x"`
-    
+
 * ###Apache Hadoop 0.23
   Includes Pig 0.9.2.
-    
+
   Includes support for Streaming
-    
+
   Build using `"0.23"` or `"0.23.x"`
 
 * ###Apache Hadoop 0.21
   Includes Pig 0.9.1
-    
+
   Includes support for Streaming
-    
+
   Build using `"0.21"` or `"0.21.x"`
 
 * ###Cloudera Distribution for Hadoop Release 3
   This is derived from Apache Hadoop 0.20.2 and includes custom patches.
-    
+
   Includes support for streaming and Pig 0.8.1.
 
   Build with `"cdh3"`
 
 * ###Cloudera Distribution for Hadoop Release 4
-    
+
   This is the newest release from Cloudera which is based on Apache Hadoop 2.0. The newer MR2/YARN APIs are not yet supported, but MR1 is still fully compatible.
-    
+
   Includes support for Streaming and Pig 0.11.1.
-    
+
   Build with `"cdh4"`
 
 ## Configuration
@@ -161,7 +162,7 @@ For examples on using Pig with the MongoDB Connector for Hadoop, also refer to t
 
 ## Notes for Contributors
 
-If your code introduces new features, please add tests that cover them if possible and make sure that the existing test suite still passes. If you're not sure how to write a test for a feature or have trouble with a test failure, please post on the google-groups with details and we will try to help. 
+If your code introduces new features, please add tests that cover them if possible and make sure that the existing test suite still passes. If you're not sure how to write a test for a feature or have trouble with a test failure, please post to the Google Group with details and we will try to help.
 
 ### Maintainers
 Mike O'Brien (mikeo@10gen.com)
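The release string is hard-coded in all three files this patch touches (README.md, project/MongoHadoopBuild.scala, testing/run_treasury.py), so a bump has to update them in lockstep. A minimal sanity check for that, as a hypothetical Python helper that is not part of the patch (the file list and version strings are taken from the hunks in this patch):

    # Hypothetical helper, not part of this patch: after bumping the release
    # string, confirm that no touched file still carries the old version and
    # that every touched file carries the new one.
    import io

    OLD, NEW = "1.2.0-rc0", "1.2.0-rc1"
    PATHS = ["README.md", "project/MongoHadoopBuild.scala", "testing/run_treasury.py"]

    for path in PATHS:
        with io.open(path, encoding="utf-8") as f:
            text = f.read()
        assert OLD not in text, "%s still references %s" % (path, OLD)
        assert NEW in text, "%s was not bumped to %s" % (path, NEW)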
diff --git a/project/MongoHadoopBuild.scala b/project/MongoHadoopBuild.scala
index d6de25f8..7597a03a 100644
--- a/project/MongoHadoopBuild.scala
+++ b/project/MongoHadoopBuild.scala
@@ -8,7 +8,7 @@ import AssemblyKeys._
 object MongoHadoopBuild extends Build {
 
   lazy val buildSettings = Seq(
-    version := "1.2.0-rc0",
+    version := "1.2.0-rc1",
     crossScalaVersions := Nil,
     crossPaths := false,
     organization := "org.mongodb"
@@ -331,7 +331,7 @@ object MongoHadoopBuild extends Build {
     println("*** Adding Hive Dependency for Version '%s'".format(hiveVersion))
 
     Seq(
-      "org.apache.hive" % "hive-serde" % hiveVersion, 
+      "org.apache.hive" % "hive-serde" % hiveVersion,
       "org.apache.hive" % "hive-exec" % hiveVersion
     )
   }
@@ -366,7 +366,7 @@ object Resolvers {
 object Dependencies {
   val mongoJavaDriver = "org.mongodb" % "mongo-java-driver" % "2.11.3"
   val hiveSerDe = "org.apache.hive" % "hive-serde" % "0.10.0"
-  val hiveExec = "org.apache.hive" % "hive-exec" % "0.10.0" 
+  val hiveExec = "org.apache.hive" % "hive-exec" % "0.10.0"
   val junit = "junit" % "junit" % "4.10" % "test"
   val flume = "com.cloudera" % "flume-core" % "0.9.4-cdh3u3"
   val casbah = "org.mongodb" %% "casbah" % "2.3.0"
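Most of the testing/run_treasury.py hunks that follow only strip trailing whitespace; the substance is the version bump and one comparison detail. compare_results() in the hunks below rounds avg to 7 decimal places before comparing, because floating-point aggregation is order-sensitive. A minimal illustration in plain Python (values are illustrative):

    # Floating-point addition is not associative, so a MapReduce aggregate can
    # differ in the low-order digits depending on the order in which partial
    # sums are combined; rounding to 7 places makes the check order-proof.
    vals = [0.1] * 10
    total = sum(vals)
    print(total)                   # 0.9999999999999999, not 1.0
    print(round(total, 7) == 1.0)  # True once rounded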
diff --git a/testing/run_treasury.py b/testing/run_treasury.py
index a56829f9..8220f08b 100644
--- a/testing/run_treasury.py
+++ b/testing/run_treasury.py
@@ -23,8 +23,8 @@
 CLEANUP_TMP=os.environ.get('CLEANUP_TMP', True)
 HADOOP_HOME=os.environ['HADOOP_HOME']
 HADOOP_RELEASE=os.environ.get('HADOOP_RELEASE',None)
-AWS_SECRET=os.environ.get('AWS_SECRET',None) 
-AWS_ACCESSKEY=os.environ.get('AWS_ACCESSKEY',None) 
+AWS_SECRET=os.environ.get('AWS_SECRET',None)
+AWS_ACCESSKEY=os.environ.get('AWS_ACCESSKEY',None)
 TEMPDIR=os.environ.get('TEMPDIR','/tmp')
 USE_ASSEMBLY=os.environ.get('USE_ASSEMBLY', True)
 num_runs = 0
@@ -35,7 +35,7 @@
 #declare -a job_args
 #cd ..
 
-VERSION_SUFFIX = "1.2.0-rc0"
+VERSION_SUFFIX = "1.2.0-rc1"
 
 
 def generate_id(size=6, chars=string.ascii_uppercase + string.digits):
@@ -66,7 +66,7 @@ def generate_jar_name(prefix, version_suffix):
 streaming_jar_name = generate_jar_name("mongo-hadoop-streaming", VERSION_SUFFIX);
 
 # result set for sanity check#{{{
-check_results = [ { "_id": 1990, "count": 250, "avg": 8.552400000000002, "sum": 2138.1000000000004 }, 
+check_results = [ { "_id": 1990, "count": 250, "avg": 8.552400000000002, "sum": 2138.1000000000004 },
       { "_id": 1991, "count": 250, "avg": 7.8623600000000025, "sum": 1965.5900000000006 },
       { "_id": 1992, "count": 251, "avg": 7.008844621513946, "sum": 1759.2200000000005 },
       { "_id": 1993, "count": 250, "avg": 5.866279999999999, "sum": 1466.5699999999997 },
@@ -87,7 +87,7 @@ def generate_jar_name(prefix, version_suffix):
       { "_id": 2008, "count": 251, "avg": 3.6642629482071714, "sum": 919.73 },
       { "_id": 2009, "count": 250, "avg": 3.2641200000000037, "sum": 816.0300000000009 },
       { "_id": 2010, "count": 189, "avg": 3.3255026455026435, "sum": 628.5199999999996 } ]#}}}
-    
+
 def compare_results(collection, reference=check_results):
     output = list(collection.find().sort("_id"))
     if len(output) != len(reference):
@@ -98,7 +98,7 @@ def compare_results(collection, reference=check_results):
         #round to account for slight changes due to precision in case ops are run in different order.
         if doc['_id'] != reference[i]['_id'] or \
            doc['count'] != reference[i]['count'] or \
-           round(doc['avg'], 7) != round(reference[i]['avg'], 7): 
+           round(doc['avg'], 7) != round(reference[i]['avg'], 7):
             print "docs do not match", doc, reference[i]
             return False
     return True
@@ -177,8 +177,8 @@ def runjob(hostname, params, input_collection='mongo_hadoop.yield_historical.in'
             cmd.append("-D")
             cmd.append(key + "=" + val)
-    
-    #if it's not set, assume that the test is 
+
+    #if it's not set, assume that the test is
     # probably setting it in some other property (e.g. multi collection)
     if input_collection:
         cmd.append("-D")
@@ -186,7 +186,7 @@ def runjob(hostname, params, input_collection='mongo_hadoop.yield_historical.in'
             input_uri = " ".join('mongodb://%s/%s?readPreference=%s' % (hostname, x, readpref) for x in input_collection)
             input_uri = '"' + input_uri + '"'
         else:
-            input_uri = 'mongodb://%s%s/%s?readPreference=%s' % (input_auth + "@" if input_auth else '', hostname, input_collection, readpref) 
+            input_uri = 'mongodb://%s%s/%s?readPreference=%s' % (input_auth + "@" if input_auth else '', hostname, input_collection, readpref)
         cmd.append("mongo.input.uri=%s" % input_uri)
 
     cmd.append("-D")
@@ -225,13 +225,13 @@ def runbsonjob(input_path, params, hostname,
     print cmd
 
     subprocess.call(' '.join(cmd), shell=True)
-    
+
 def runstreamingjob(hostname, params, input_collection='mongo_hadoop.yield_historical.in',
                     output_collection='mongo_hadoop.yield_historical.out',
                     readpref="primary",
                     input_auth=None,
-                    output_auth=None, 
+                    output_auth=None,
                     inputpath='file://' + os.path.join(TEMPDIR, 'in'),
                     outputpath='file://' + os.path.join(TEMPDIR, 'out'),
                     inputformat='com.mongodb.hadoop.mapred.MongoInputFormat',
@@ -250,9 +250,9 @@ def runstreamingjob(hostname, params, input_collection='mongo_hadoop.yield_histo
     cmd += ["-inputformat",inputformat]
     cmd += ["-outputformat",outputformat]
     cmd += ["-io", 'mongodb']
-    input_uri = 'mongodb://%s%s/%s?readPreference=%s' % (input_auth + "@" if input_auth else '', hostname, input_collection, readpref) 
+    input_uri = 'mongodb://%s%s/%s?readPreference=%s' % (input_auth + "@" if input_auth else '', hostname, input_collection, readpref)
     cmd += ['-jobconf', "mongo.input.uri=%s" % input_uri]
-    output_uri = "mongo.output.uri=mongodb://%s%s/%s" % (output_auth + "@" if output_auth else '', hostname, output_collection) 
+    output_uri = "mongo.output.uri=mongodb://%s%s/%s" % (output_auth + "@" if output_auth else '', hostname, output_collection)
     cmd += ['-jobconf', output_uri]
     cmd += ['-jobconf', 'stream.io.identifier.resolver.class=com.mongodb.hadoop.streaming.io.MongoIdentifierResolver']
@@ -518,7 +518,7 @@ def test_treasury(self):
                 #logging.info(doc['_id'], "was on shard ", self.shard1.name, "now on ", self.shard2.name)
                 #print "inserting", doc
                 destination['mongo_hadoop']['yield_historical.in'].insert(doc, safe=True)
-        
+
         PARAMETERS = DEFAULT_PARAMETERS.copy()
         PARAMETERS['mongo.input.split.allow_read_from_secondaries'] = 'true'
         PARAMETERS['mongo.input.split.read_from_shards'] = 'true'
@@ -574,7 +574,7 @@ def setUp(self):
         self.temp_outdir = tempfile.mkdtemp(prefix='hadooptest_', dir=TEMPDIR)
         mongo_manager.mongo_dump(self.server_hostname, "mongo_hadoop", "yield_historical.in",
                                  self.temp_outdir)
-        
+
     def tearDown(self):
         logging.info("TestStaticBSON teardown")
@@ -756,7 +756,7 @@ def test_treasury(self):
         logging.info("Testing standalone with authentication on")
         x = self.server.connection()['admin'].add_user("test_user","test_pw", roles=["clusterAdmin", "readWriteAnyDatabase"])
         PARAMETERS = DEFAULT_PARAMETERS.copy()
-        PARAMETERS['mongo.auth.uri'] = 'mongodb://%s:%s@%s/admin' % ('test_user', 'test_pw', self.server_hostname) 
+        PARAMETERS['mongo.auth.uri'] = 'mongodb://%s:%s@%s/admin' % ('test_user', 'test_pw', self.server_hostname)
         runjob(self.server_hostname, PARAMETERS)
         server_connection = self.server.connection()
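For reference, the URI-building statements reflowed above in runjob() and runstreamingjob() all follow one pattern: optional "user:password@" credentials plus a readPreference query parameter. A minimal standalone sketch, with a hypothetical function name and illustrative host and collection values (the format string is taken verbatim from the script):

    # Sketch of the mongo.input.uri construction in runjob()/runstreamingjob().
    # build_input_uri is a hypothetical name; auth is "user:password" or None.
    def build_input_uri(hostname, collection, readpref="primary", auth=None):
        return 'mongodb://%s%s/%s?readPreference=%s' % (
            auth + "@" if auth else '', hostname, collection, readpref)

    print(build_input_uri("localhost:27017", "mongo_hadoop.yield_historical.in"))
    # mongodb://localhost:27017/mongo_hadoop.yield_historical.in?readPreference=primary
    print(build_input_uri("localhost:27017", "mongo_hadoop.yield_historical.in",
                          auth="test_user:test_pw"))
    # mongodb://test_user:test_pw@localhost:27017/mongo_hadoop.yield_historical.in?readPreference=primary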