Merge pull request #173 from telefonicaid/bug/172_run_jars_in_hdfs
bug/172_run_jars_in_hdfs
pcoello25 committed Jun 9, 2016
2 parents 917ecf3 + 37bc57f commit 5ad2ad2
Showing 5 changed files with 54 additions and 29 deletions.
1 change: 1 addition & 0 deletions CHANGES_NEXT_RELEASE
@@ -9,3 +9,4 @@
 - [cosmos] [HARDENING] Fix the base path for readthedocks in mkdocs.yml (#164)
 - [cosmos] [HARDENING] Update the Quick Start Guide with regards to the new FIWARE Lab clusters (#167)
 - [cosmos] [HARDENING] Add Hive clients to resources folder (#168)
+- [cosmos-tidoop-api] [BUG] Run Java jars located in HDFS (#172)
59 changes: 38 additions & 21 deletions cosmos-tidoop-api/src/cmd_runner.js
@@ -26,30 +26,47 @@
 // Module dependencies
 var spawn = require('child_process').spawn;
 
-function runHadoopJar(userId, jar, className, jarPath, input, output, callback) {
-    var params = ['-u', userId, 'hadoop', 'jar', jar, className, '-libjars', jarPath, input, output];
+function runHadoopJar(userId, jarName, jarInHDFS, className, libJarsName, libJarsInHDFS, input, output, callback) {
+    // Copy the jar from the HDFS user space
+    var params = ['-u', userId, 'hadoop', 'fs', '-copyToLocal', jarInHDFS, '/home/' + userId + '/' + jarName];
     var command = spawn('sudo', params);
-    var jobId = null;
 
-    // This function catches the stderr as it is being produced (console logs are printed in the stderr). At the moment
-    // of receiving the line containing the job ID, get it and return with no error (no error means the job could be
-    // run, independently of the final result of the job)
-    command.stderr.on('data', function (data) {
-        var dataStr = data.toString();
-        var magicString = 'Submitting tokens for job: ';
-        var indexOfJobId = dataStr.indexOf(magicString);
-
-        if(indexOfJobId >= 0) {
-            jobId = dataStr.substring(indexOfJobId + magicString.length, indexOfJobId + magicString.length + 22);
-            return callback(null, jobId);
-        } // if
-    });
+    command.on('close', function(code) {
+        // Copy the libjar from the HDFS user space
+        var params = ['-u', userId, 'hadoop', 'fs', '-copyToLocal', libJarsInHDFS, '/home/' + userId + '/' + libJarsName];
+        var command = spawn('sudo', params);
+
+        command.on('close', function(code) {
+            // Run the MR job
+            var params = ['-u', userId, 'hadoop', 'jar', '/home/' + userId + '/' + jarName, className, '-libjars', '/home/' + userId + '/' + libJarsName, input, output];
+            var command = spawn('sudo', params);
+            var jobId = null;
+
+            // This function catches the stderr as it is being produced (console logs are printed in the stderr). At the
+            // moment of receiving the line containing the job ID, get it and return with no error (no error means the
+            // job could be run, independently of the final result of the job)
+            command.stderr.on('data', function (data) {
+                var dataStr = data.toString();
+                var magicString = 'Submitting tokens for job: ';
+                var indexOfJobId = dataStr.indexOf(magicString);
+
+                if(indexOfJobId >= 0) {
+                    jobId = dataStr.substring(indexOfJobId + magicString.length, indexOfJobId + magicString.length + 22);
+                    var params = ['-u', userId, 'rm', '/home/' + userId + '/' + jarName];
+                    var command = spawn('sudo', params);
+                    var params = ['-u', userId, 'rm', '/home/' + userId + '/' + libJarsName];
+                    var command = spawn('sudo', params);
+                    return callback(null, jobId);
+                } // if
+            });
 
-    // This function catches the moment the command finishes. Return the error code if the job ID was never got
-    command.on('close', function (code) {
-        if (jobId === null) {
-            return callback(code, null);
-        } // if
+            // This function catches the moment the command finishes. Return the error code if the job ID was never got
+            command.on('close', function (code) {
+                if (jobId === null) {
+                    return callback(code, null);
+                } // if
+            });
+        });
     });
 } // runHadoopJar
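To make the nesting easier to follow, here is a minimal sketch of the copy, run and clean-up sequence the new `runHadoopJar` chains through 'close' events. The `runAsUser` helper and the `runHadoopJarSketch` wrapper are hypothetical, not part of the repository, and the sketch omits the stderr parsing that extracts the job ID for the callback:

    var spawn = require('child_process').spawn;

    // Hypothetical helper: run a command impersonating the given user, as
    // 'sudo -u <userId> <cmd...>', and invoke next() when the process exits
    function runAsUser(userId, args, next) {
        spawn('sudo', ['-u', userId].concat(args)).on('close', next);
    }

    function runHadoopJarSketch(userId, jarName, jarInHDFS, className, libJarsName, libJarsInHDFS, input, output) {
        var localJar = '/home/' + userId + '/' + jarName;
        var localLibJars = '/home/' + userId + '/' + libJarsName;

        // 1. Copy the jar and the libjars from the HDFS user space to local disk
        runAsUser(userId, ['hadoop', 'fs', '-copyToLocal', jarInHDFS, localJar], function() {
            runAsUser(userId, ['hadoop', 'fs', '-copyToLocal', libJarsInHDFS, localLibJars], function() {
                // 2. Submit the MapReduce job using the local copies
                runAsUser(userId, ['hadoop', 'jar', localJar, className, '-libjars', localLibJars, input, output],
                    function() {
                        // 3. Remove the local copies (the real code does this as soon
                        //    as the job ID shows up in the job's stderr)
                        runAsUser(userId, ['rm', localJar], function() {});
                        runAsUser(userId, ['rm', localLibJars], function() {});
                    });
            });
        });
    } // runHadoopJarSketch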

13 changes: 10 additions & 3 deletions cosmos-tidoop-api/src/tidoop_api.js
@@ -55,17 +55,24 @@ server.route({
     path: '/tidoop/v1/user/{userId}/jobs',
     handler: function (request, reply) {
         var userId = request.params.userId;
-        var jar = request.payload.jar;
+        var jarInHDFS = 'hdfs://' + config.storage_cluster.namenode_host + ':'
+            + config.storage_cluster.namenode_ipc_port + '/user/' + userId + '/' + request.payload.jar;
+        var splits = request.payload.jar.split("/");
+        var jarName = splits[splits.length - 1];
         var className = request.payload.class_name;
-        var libJars = request.payload.lib_jars;
+        var libJarsInHDFS = 'hdfs://' + config.storage_cluster.namenode_host + ':'
+            + config.storage_cluster.namenode_ipc_port + '/user/' + userId + '/' + request.payload.lib_jars;
+        var splits = request.payload.lib_jars.split("/");
+        var libJarsName = splits[splits.length - 1];
         var input = 'hdfs://' + config.storage_cluster.namenode_host + ':' + config.storage_cluster.namenode_ipc_port
             + '/user/' + userId + '/' + request.payload.input;
         var output = 'hdfs://' + config.storage_cluster.namenode_host + ':' + config.storage_cluster.namenode_ipc_port
             + '/user/' + userId + '/' + request.payload.output;
 
         logger.info('Request: POST /tidoop/v1/user/' + userId + '/jobs ' + request.payload);
 
-        cmdRunner.runHadoopJar(userId, jar, className, libJars, input, output, function(error, result) {
+        cmdRunner.runHadoopJar(userId, jarName, jarInHDFS, className, libJarsName, libJarsInHDFS, input, output,
+            function(error, result) {
             if (error && error >= 0) {
                 var response = '{"success":"false","error":' + error + '}';
                 logger.info(response);
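As an illustration of the new path handling (all values below are invented; `config.storage_cluster.*` stands for the fields read from tidoop's configuration), the handler now derives both a full HDFS URI and a bare file name from the relative path carried in the payload:

    // Illustration only: invented values
    var namenodeHost = 'storagecluster.example.org';   // config.storage_cluster.namenode_host
    var namenodeIpcPort = 8020;                        // config.storage_cluster.namenode_ipc_port
    var userId = 'frb';
    var payloadJar = 'jars/hadoop-mapreduce-examples.jar';

    var jarInHDFS = 'hdfs://' + namenodeHost + ':' + namenodeIpcPort + '/user/' + userId + '/' + payloadJar;
    // -> 'hdfs://storagecluster.example.org:8020/user/frb/jars/hadoop-mapreduce-examples.jar'

    var splits = payloadJar.split('/');
    var jarName = splits[splits.length - 1];
    // -> 'hadoop-mapreduce-examples.jar'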
8 changes: 4 additions & 4 deletions doc/manuals/quick_start_guide_new.md
@@ -166,13 +166,13 @@ Coming soon.
 [Top](#top)
 
 ###<a name="section3.5"></a>Step 5: Run your first MapReduce job
-Several pre-loaded MapReduce examples can be found in every Hadoop distribution, typically in a Java `-jar` file called `hadoop-mapreduce-examples.jar`. In this case, the <i>Computing Endpoint</i> owns that file at:
+Several already developed MapReduce examples can be found in every Hadoop distribution, typically in a Java `.jar` file called `hadoop-mapreduce-examples.jar`. This file is copied to the HDFS space a user owns in FIWARE Lab, specifically under the `jars/` folder, so the `frb` user should have it copied to:
 
-    /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar
+    hdfs:///user/frb/jars/hadoop-mapreduce-examples.jar
 
-For instance, you can run the <i>Word Count</i> example (this is also know as the "hello world" of Hadoop) by typing:
+Thus, you can run the <i>Word Count</i> example (this is also known as the "hello world" of Hadoop) by typing:
 
-    $ curl -X POST "http://computing.cosmos.lab.fiware.org:12000/tidoop/v1/user/frb/jobs" -d '{"jar":"/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar","class_name":"wordcount","lib_jars":"/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar","input":"testdir","output":"testoutput"}' -H "Content-Type: application/json" -H "X-Auth-Token: 3azH09G1PdaGmgBNODLOtxy52f5a00"
+    $ curl -X POST "http://computing.cosmos.lab.fiware.org:12000/tidoop/v1/user/frb/jobs" -d '{"jar":"jars/hadoop-mapreduce-examples.jar","class_name":"wordcount","lib_jars":"jars/hadoop-mapreduce-examples.jar","input":"testdir","output":"testoutput"}' -H "Content-Type: application/json" -H "X-Auth-Token: 3azH09G1PdaGmgBNODLOtxy52f5a00"
     {"success":"true","job_id": "job_1460639183882_0001"}
 
 As you can see, another REST API has been used, in this case the Tidoop REST API in the <i>Computing Endpoint</i>. The API allows you to check the status of the job as well:
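For instance, a query along the following lines (a sketch reusing the same URL pattern and the job ID returned above; the exact response fields may vary) would retrieve the job status:

    $ curl -X GET "http://computing.cosmos.lab.fiware.org:12000/tidoop/v1/user/frb/jobs/job_1460639183882_0001" -H "X-Auth-Token: 3azH09G1PdaGmgBNODLOtxy52f5a00"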
2 changes: 1 addition & 1 deletion mkdocs.yml
@@ -10,7 +10,7 @@ theme: readthedocs
 extra_css: ["https://fiware.org/style/fiware_readthedocs.css"]
 pages:
     - Home: index.md
-    - 'Quick Start Guide': 'quick_start_guide.md'
+    - 'Quick Start Guide': 'quick_start_guide_new.md'
     - 'Installation and Administration manual':
         - 'Introduction': 'installation_and_administration_manual/introduction.md'
         - 'Batch processing: Some words about Cosmos and its ecosystem': 'installation_and_administration_manual/batch/some_words_about_cosmos_and_ecosystem.md'
