diff --git a/.gitignore b/.gitignore
index 81bf736..c941b05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -115,3 +115,4 @@ src/main/scala/tap/nlp/old/NlpDocument.scala
 src/main/scala/tap/nlp/old/OldNlpSentence.scala
 src/main/scala/tap/nlp/old/SentenceConverter.scala
 /src/main/scala/worksheet.sc
+/src/main/scala/experiments.sc
diff --git a/build.sbt b/build.sbt
index 1f43d4a..495174a 100644
--- a/build.sbt
+++ b/build.sbt
@@ -16,7 +16,7 @@
 
 name := "tap"
 
-version := "3.0.7"
+version := "3.0.8"
 
 scalaVersion := "2.12.3"
 
@@ -51,16 +51,18 @@ val apiDependencies = Seq(
 )
 
 val analyticsDependencies = Seq(
-  "io.nlytx" %% "factorie-nlp-api" % nlytxFactorieVersion,
-  "cc.factorie.app.nlp" % "all-models" % factorieVersion,
+  "io.nlytx" %% "nlytx-nlp-api" % "1.0.2",
+  "io.nlytx" %% "factorie-nlp-models" % "1.0.3",
   "com.typesafe.akka" % "akka-stream_2.12" % akkaStreamVersion,
   "org.apache.opennlp" % "opennlp-tools" % openNlpVersion,
   "org.languagetool" % "language-en" % langToolVersion
 )
+resolvers += Resolver.bintrayRepo("nlytx", "nlytx-nlp")
 
 val generalDependencies = Seq(
   "io.nlytx" %% "commons" % nlytxCommonsVersion
 )
+resolvers += Resolver.bintrayRepo("nlytx", "nlytx_commons")
 
 val testDependencies = Seq(
   "org.scalactic" %% "scalactic" % scalatestVersion,
@@ -74,8 +76,8 @@ libraryDependencies ++= apiDependencies ++ analyticsDependencies ++ generalDepen
 
 scalacOptions in (Compile, doc) ++= Seq("-doc-root-content", baseDirectory.value+"/src/main/scala/root-doc.md")
 
-resolvers += Resolver.bintrayRepo("nlytx", "nlytx_commons")
-resolvers += Resolver.bintrayRepo("nlytx-io", "factorie-nlp-api")
+
+
 
 //Documentation - run ;paradox;copyDocs
 enablePlugins(ParadoxPlugin) //Generate documentation with Paradox
diff --git a/project/build.properties b/project/build.properties
index 9be35ab..9d2366e 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -15,4 +15,4 @@
 #
 #sbt.version = 0.13.16
 
-sbt.version = 1.0.2
\ No newline at end of file
+sbt.version = 1.0.3
\ No newline at end of file
diff --git a/src/main/public/js/app-graphiql.js b/src/main/public/js/app-graphiql.js
index 8592c67..4cb911f 100644
--- a/src/main/public/js/app-graphiql.js
+++ b/src/main/public/js/app-graphiql.js
@@ -34,31 +34,59 @@
         "# Text Analytics Pipeline\n" +
         "\n"
 
-    var exampleQueries = "query CleanText($input: String!) {\n" +
-        "  clean(text:$input) {\n" +
+    var exampleQueries = "# TAP Example Queries\n" +
+        "# -------------------\n" +
+        "# These queries are just examples of what TAP can do; the\n" +
+        "# actual capability of the server at any point in time can\n" +
+        "# be found in the Schema - see the Documentation Explorer\n" +
+        "# on the right hand side.\n" +
+        "# See the TAP Documentation https://uts-cic.github.io/tap/\n" +
+        "\n" +
+        "# All queries require submission of text and return\n" +
+        "# analytics, timestamp, message, querytime\n" +
+        "\n" +
+        "# Some queries like \"moves\" require external resources and may\n" +
+        "# fail if those resources are not available.\n" +
+        "\n" +
+        "query RhetoricalMoves($input: String!) {\n" +
+        "  moves(text:$input,grammar:\"analytic\") {\n" +
         "    analytics\n" +
+        "    message\n" +
         "    timestamp\n" +
+        "    querytime\n" +
         "  }\n" +
-        "  cleanPreserve(text:$input) {\n" +
-        "    analytics\n" +
-        "  }\n" +
-        "  cleanMinimal(text:$input) {\n" +
-        "    analytics\n" +
-        "  }\n" +
-        "  cleanAscii(text:$input) {\n" +
+        "}\n" +
+        "# However, you only need to ask for what you want, e.g.\n" +
+        "query MinimalMoves($input:String!) {\n" +
+        "  moves(text:$input) {\n" +
         "    analytics\n" +
         "  }\n" +
         "}\n" +
         "\n" +
-        "query MakeVisible($input: String!) {\n" +
-        "  visible(text:$input) {\n" +
-        "    analytics\n" +
-        "    timestamp\n" +
+        "# Tokenise with pipetype (default is 'fast'):\n" +
+        "#   fast - lemmas and postags\n" +
+        "#   standard - lemmas, postags, parse data\n" +
+        "#   ner - lemmas, postags, parse data, nertags\n" +
+        "\n" +
+        "query Tokenise($input: String!) {\n" +
+        "  annotations(text:$input) {\n" +
+        "    analytics {\n" +
+        "      idx\n" +
+        "      start\n" +
+        "      end\n" +
+        "      length\n" +
+        "      tokens {\n" +
+        "        idx\n" +
+        "        term\n" +
+        "        lemma\n" +
+        "        postag\n" +
+        "      }\n" +
+        "    }\n" +
         "  }\n" +
         "}\n" +
         "\n" +
-        "query Annotations($input: String!) {\n" +
-        "  annotations(text:$input) {\n" +
+        "query TokeniseWithNer($input: String!) {\n" +
+        "  annotations(text:$input,pipetype:\"ner\") {\n" +
         "    analytics {\n" +
         "      idx\n" +
         "      start\n" +
@@ -70,14 +98,34 @@
         "        lemma\n" +
         "        postag\n" +
         "        parent\n" +
-        "        child\n" +
+        "        children\n" +
         "        deptype\n" +
+        "        nertag\n" +
         "      }\n" +
         "    }\n" +
         "    timestamp\n" +
         "  }\n" +
         "}\n" +
         "\n" +
+        "# Other examples\n" +
+        "query Expressions($input2:String!) {\n" +
+        "  expressions(text:$input2) {\n" +
+        "    analytics {\n" +
+        "      sentIdx\n" +
+        "      affect{\n" +
+        "        text\n" +
+        "      }\n" +
+        "      epistemic {\n" +
+        "        text\n" +
+        "        startIdx\n" +
+        "        endIdx\n" +
+        "      }\n" +
+        "      modal {\n" +
+        "        text\n" +
+        "      }\n" +
+        "    }\n" +
+        "  }\n" +
+        "}\n" +
         "query Vocab($input: String!) {\n" +
         "  vocabulary(text:$input){\n" +
         "    analytics {\n" +
@@ -90,9 +138,8 @@
         "    timestamp\n" +
         "  }\n" +
         "}\n" +
-        "\n" +
-        "query Metrics($input2: String!) {\n" +
-        "  metrics(text:$input2) {\n" +
+        "query Metrics($input: String!) {\n" +
+        "  metrics(text:$input) {\n" +
         "    analytics {\n" +
         "      sentences\n" +
         "      tokens\n" +
@@ -109,47 +156,16 @@
         "    timestamp\n" +
         "  }\n" +
         "}\n" +
-        "\n" +
-        "query Athanor($input: String!) {\n" +
-        "  moves(text:$input) {\n" +
-        "    analytics\n" +
-        "  }\n" +
-        "}\n" +
-        "\n" +
-        "query Expressions($input:String!) {\n" +
-        "  expressions(text:$input) {\n" +
+        "query PosStats($input:String!){\n" +
+        "  posStats(text:$input) {\n" +
         "    analytics {\n" +
-        "      sentIdx\n" +
-        "      affect{\n" +
-        "        text\n" +
-        "      }\n" +
-        "      epistemic {\n" +
-        "        text\n" +
-        "        startIdx\n" +
-        "        endIdx\n" +
-        "      }\n" +
-        "      modal {\n" +
-        "        text\n" +
-        "      }\n" +
-        "    }\n" +
-        "  }\n" +
-        "}\n" +
-        "\n" +
-        "query Expressions2($input2:String!) {\n" +
-        "  expressions(text:$input2) {\n" +
-        "    analytics {\n" +
-        "      sentIdx\n" +
-        "      affect{\n" +
-        "        text\n" +
-        "      }\n" +
-        "      epistemic {\n" +
-        "        text\n" +
-        "        startIdx\n" +
-        "        endIdx\n" +
-        "      }\n" +
-        "      modal {\n" +
-        "        text\n" +
+        "      verbNounRatio\n" +
+        "      futurePastRatio\n" +
+        "      adjectiveWordRatio\n" +
+        "      namedEntityWordRatio\n" +
+        "      nounDistribution\n" +
+        "      verbDistribution\n" +
+        "      adjectiveDistribution\n" +
         "    }\n" +
         "  }\n" +
         "}\n" +
@@ -178,11 +194,37 @@
         "      }\n" +
         "    }\n" +
         "  }\n" +
+        "}\n" +
+        "\n" +
+        "\n" +
+        "##############################\n" +
+        "# UTILITY QUERIES\n" +
+        "\n" +
+        "query MakeVisible($input: String!) {\n" +
+        "  visible(text:$input) {\n" +
+        "    analytics\n" +
+        "    timestamp\n" +
+        "  }\n" +
+        "}\n" +
+        "query AllCleaning($input: String!) {\n" +
+        "  clean(text:$input) {\n" +
+        "    analytics\n" +
+        "    timestamp\n" +
+        "  }\n" +
+        "  cleanPreserve(text:$input) {\n" +
+        "    analytics\n" +
+        "  }\n" +
+        "  cleanMinimal(text:$input) {\n" +
+        "    analytics\n" +
+        "  }\n" +
+        "  cleanAscii(text:$input) {\n" +
+        "    analytics\n" +
+        "  }\n" +
         "}" +
         "\n"
 
-    var exampleVariables = "{\"input\": \"I didn't take any time to review the subject outline nor did I log onto UTS Online to review any supporting information to provide context, I walked into class like a blank canvas. I had no idea what this course was about but I was certain it had something to do with responsibility and leaders. I reflected on this and felt decision making was like second nature, yes I over-thought my decisions whether it was personal or professional but I never thought of the act of having to justify my decisions.\"," +
-        "\"input2\": \"Although I wasn't certain, I did believe that I was doing the right thing. Next time I will be sure.\"}"
+    var exampleVariables = "{\"input\": \"It didn't take any time for Dr. Smith to review the subject outline by logging onto UTS Online. However, I walked into class like a blank canvas. I had no idea what this course was about but I was certain it had something to do with responsibility and leaders. I reflected on this and felt decision making was like second nature, yes I over-thought my decisions whether it was personal or professional but I never thought of the act of having to justify my decisions.\"," +
+        "\"input2\": \"Althogh I wasn't certain, I did believe that I was doing the right thing. Next time I will be sure.\"}"
diff --git a/src/main/scala/handlers/ExternalAnalysisHandler.scala b/src/main/scala/handlers/ExternalAnalysisHandler.scala
index e23ad8a..44436d5 100644
--- a/src/main/scala/handlers/ExternalAnalysisHandler.scala
+++ b/src/main/scala/handlers/ExternalAnalysisHandler.scala
@@ -16,15 +16,13 @@
 
 package handlers
 
-import java.io.File
 import javax.inject.Inject
 
-import tap.pipelines.materialize.PipelineContext.{executor, materializer}
-import com.typesafe.config.ConfigFactory
-import models.Results.{StringListResult, StringResult}
-import play.api.{Configuration, Environment, Logger, Mode}
-import play.api.libs.ws.ahc.{AhcWSClient, AhcWSClientConfig, AhcWSClientConfigFactory}
+import models.Results.StringListResult
+import play.api.Logger
 import play.api.libs.ws.{WSClient, WSRequest, WSResponse}
+import tap.pipelines.materialize.PipelineContext.executor
+
 import scala.concurrent.Future
 import scala.concurrent.duration.DurationInt
@@ -37,42 +35,46 @@ class ExternalAnalysisHandler @Inject() (wsClient: WSClient) {
 
   def analyseWithAthanor(text:String,grammar:Option[String]):Future[StringListResult] = {
     //logger.info(s"Analysing with athanor: $text")
-    val parameter = "?grammar="+grammar.getOrElse("analytic")
-    val url = "http://athanor.utscic.edu.au/v2/analyse/text/rhetorical"+parameter
+    val parameter = "?grammar=" + grammar.getOrElse("analytic")
+    val url = "http://athanor.utscic.edu.au/v2/analyse/text/rhetorical" + parameter
     logger.info(s"Creating request to: $url")
 
     val request: WSRequest = wsClient.url(url)
     val athanorRequest: WSRequest = request
-      .withHttpHeaders("Accept" -> "application/json")
-      .withRequestTimeout(10000.millis)
+      .withHttpHeaders("Accept" -> "application/json")
+      .withRequestTimeout(30000.millis)
 
     val futureResponse: Future[WSResponse] = athanorRequest.post(text)
 
-    case class AthanorMsg(message:String, results:List[List[String]])
+    case class AthanorMsg(message: String, results: Vector[Vector[String]])
 
-    import play.api.libs.functional.syntax._ //scalastyle:ignore
-    import play.api.libs.json._ //scalastyle:ignore
+    import play.api.libs.functional.syntax._
+    import play.api.libs.json._ //scalastyle:ignore
 
     implicit val AMWrites: Writes[AthanorMsg] = (
       (JsPath \ "message").write[String] and
-      (JsPath \ "results").write[List[List[String]]]
-    )(unlift(AthanorMsg.unapply))
+      (JsPath \ "results").write[Vector[Vector[String]]]
+    ) (unlift(AthanorMsg.unapply))
 
-    implicit val AMReads:Reads[AthanorMsg] = (
+    implicit val AMReads: Reads[AthanorMsg] = (
       (JsPath \ "message").read[String] and
-      (JsPath \ "results").read[List[List[String]]]
-    )(AthanorMsg.apply _)
+      (JsPath \ "results").read[Vector[Vector[String]]]
+    ) (AthanorMsg.apply _)
 
-    val result:Future[List[List[String]]] = futureResponse.map { response =>
-      response.json.as[AthanorMsg].results
-    }
-
-    result.map(s => StringListResult(s))
+    logger.warn("About to try and get result...")
+    val errMsg = "There was a problem connecting to the Athanor server."
+    // Map a successful response into an ok result, and recover any connection
+    // or parsing failure into an empty result that carries the error message
+    val result: Future[StringListResult] = futureResponse.map { response =>
+      val res = response.json.as[AthanorMsg].results
+      StringListResult(res, "ok")
+    }.recover {
+      case e: Throwable =>
+        val msg = s"$errMsg: $e"
+        logger.error(msg)
+        StringListResult(Vector(), msg)
+    }
+    result
   }
 
-//  def analyseWithXip(text:String):Future[StringResult] = Future {
-//
-//    StringResult("")
-//  }
-
 }
diff --git a/src/main/scala/handlers/TextAnalysisHandler.scala b/src/main/scala/handlers/TextAnalysisHandler.scala
index 4e0dac4..f8f82b6 100644
--- a/src/main/scala/handlers/TextAnalysisHandler.scala
+++ b/src/main/scala/handlers/TextAnalysisHandler.scala
@@ -18,33 +18,39 @@
 package handlers
 
 import javax.inject.Inject
+import models.Results._ // scalastyle:ignore
 import tap.pipelines.materialize.TextPipeline
-import tap.pipelines.{Cleaning, Annotating}
-import models.Results._
+import tap.pipelines.{Annotating, Cleaning}
+import tap.pipelines.AnnotatingTypes.{DEFAULT,validPipeType}
 
-import scala.concurrent.Future
 import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.Future
 
 /**
   * Created by andrew@andrewresearch.net on 20/2/17.
   */
 
-class TextAnalysisHandler @Inject() (cleaning: Cleaning, annotating: Annotating) {
+class TextAnalysisHandler @Inject() (clean: Cleaning, annotate: Annotating) {
+
+  private val pipe = annotate.Pipeline
 
   /* Cleaning Pipeline */
-  def visible(text:String):Future[StringResult] = TextPipeline(text,cleaning.Pipeline.revealInvisible).run.map(StringResult(_))
-  def clean(text:String):Future[StringResult] = TextPipeline(text,cleaning.Pipeline.utfSimplify).run.map(StringResult(_))
-  def cleanPreserve(text:String):Future[StringResult] = TextPipeline(text,cleaning.Pipeline.lengthPreserve).run.map(StringResult(_))
-  def cleanMinimal(text:String):Future[StringResult] = TextPipeline(text,cleaning.Pipeline.utfMinimal).run.map(StringResult(_))
-  def cleanAscii(text:String):Future[StringResult] = TextPipeline(text,cleaning.Pipeline.asciiOnly).run.map(StringResult(_))
+  def visible(text:String):Future[StringResult] = TextPipeline(text,clean.Pipeline.revealInvisible).run.map(StringResult(_))
+  def clean(text:String):Future[StringResult] = TextPipeline(text,clean.Pipeline.utfSimplify).run.map(StringResult(_))
+  def cleanPreserve(text:String):Future[StringResult] = TextPipeline(text,clean.Pipeline.lengthPreserve).run.map(StringResult(_))
+  def cleanMinimal(text:String):Future[StringResult] = TextPipeline(text,clean.Pipeline.utfMinimal).run.map(StringResult(_))
+  def cleanAscii(text:String):Future[StringResult] = TextPipeline(text,clean.Pipeline.asciiOnly).run.map(StringResult(_))
 
   /* Annotating Pipeline */
-  def sentences(text:String):Future[SentencesResult] = TextPipeline(text,annotating.Pipeline.sentences).run.map(SentencesResult(_))
-  def expressions(text:String):Future[ExpressionsResult] = TextPipeline(text,annotating.Pipeline.expressions).run.map(ExpressionsResult(_))
-  def syllables(text:String):Future[SyllablesResult] = TextPipeline(text,annotating.Pipeline.syllables).run.map(SyllablesResult(_))
-  def spelling(text:String):Future[SpellingResult] = TextPipeline(text,annotating.Pipeline.spelling).run.map(SpellingResult(_))
-  def vocabulary(text:String):Future[VocabResult] = TextPipeline(text,annotating.Pipeline.vocab).run.map(VocabResult(_))
-  def metrics(text:String):Future[MetricsResult] = TextPipeline(text,annotating.Pipeline.metrics).run.map(MetricsResult(_))
-  def posStats(text:String):Future[PosStatsResult] = TextPipeline(text,annotating.Pipeline.posStats).run.map(PosStatsResult(_))
+  def annotations(text:String,pipetype:Option[String]):Future[SentencesResult] = {
+    TextPipeline(text, annotate.build(validPipeType(pipetype),pipe.sentences)).run.map(SentencesResult(_))
+  }
+  // These analyses don't require parsing or NER, so they can use the FAST (DEFAULT) pipetype
+  def expressions(text:String):Future[ExpressionsResult] = TextPipeline(text,annotate.build(DEFAULT,pipe.expressions)).run.map(ExpressionsResult(_))
+  def syllables(text:String):Future[SyllablesResult] = TextPipeline(text,annotate.build(DEFAULT,pipe.syllables)).run.map(SyllablesResult(_))
+  def spelling(text:String):Future[SpellingResult] = TextPipeline(text,annotate.build(DEFAULT,pipe.spelling)).run.map(SpellingResult(_))
+  def vocabulary(text:String):Future[VocabResult] = TextPipeline(text,annotate.build(DEFAULT,pipe.vocab)).run.map(VocabResult(_))
+  def metrics(text:String):Future[MetricsResult] = TextPipeline(text,annotate.build(DEFAULT,pipe.metrics)).run.map(MetricsResult(_))
+  def posStats(text:String):Future[PosStatsResult] = TextPipeline(text,annotate.build(DEFAULT,pipe.posStats)).run.map(PosStatsResult(_))
 
   //TODO To be implemented
diff --git a/src/main/scala/models/GraphqlActions.scala b/src/main/scala/models/GraphqlActions.scala
index 7bbbe9f..b41f456 100644
--- a/src/main/scala/models/GraphqlActions.scala
+++ b/src/main/scala/models/GraphqlActions.scala
@@ -35,7 +35,8 @@ class GraphqlActions @Inject() (textAnalysisHandler: TextAnalysisHandler, extern
   def cleanPreserve(text:String):Future[StringResult] = textAnalysisHandler.cleanPreserve(text)
   def cleanMinimal(text:String):Future[StringResult] = textAnalysisHandler.cleanMinimal(text)
   def cleanAscii(text:String):Future[StringResult] = textAnalysisHandler.cleanAscii(text)
-  def sentences(text:String):Future[SentencesResult] = textAnalysisHandler.sentences(text)
+
+  def annotations(text:String,pipetype:Option[String]):Future[SentencesResult] = textAnalysisHandler.annotations(text,pipetype)
   def vocabulary(text:String):Future[VocabResult] = textAnalysisHandler.vocabulary(text)
   def metrics(text:String):Future[MetricsResult] = textAnalysisHandler.metrics(text)
   def expressions(text:String):Future[ExpressionsResult] = textAnalysisHandler.expressions(text)
diff --git a/src/main/scala/models/GraphqlSchema.scala b/src/main/scala/models/GraphqlSchema.scala
index b66de4c..1cc827d 100644
--- a/src/main/scala/models/GraphqlSchema.scala
+++ b/src/main/scala/models/GraphqlSchema.scala
@@ -32,6 +32,7 @@ class GraphqlSchema {
 
   val inputText:Argument[String] = Argument("text", StringType)
   val moveGrammar:Argument[Option[String]] = Argument("grammar",OptionInputType(StringType))
+  val pipetype:Argument[Option[String]] = Argument("pipetype",OptionInputType(StringType))
 
   val allFields = fields[GraphqlActions,Unit](
     Field("visible", StringResultType,
@@ -49,9 +50,10 @@
     Field("cleanAscii",StringResultType,
       Some("Returns ascii safe cleaned text"),
       arguments = inputText :: Nil, resolve = c => c.ctx.cleanAscii(c arg inputText)),
+
     Field("annotations", deriveObjectType[Unit,SentencesResult](Interfaces[Unit,SentencesResult](ResultType)),
       Some("Returns sentences for text"),
-      arguments = inputText :: Nil, resolve = c => c.ctx.sentences(c arg inputText)),
+      arguments = inputText :: pipetype :: Nil, resolve = c => c.ctx.annotations(c arg inputText,c arg pipetype)),
     Field("vocabulary",deriveObjectType[Unit,VocabResult](Interfaces[Unit,VocabResult](ResultType)),
       description = Some("Returns vocabulary for text"),
       arguments = inputText :: Nil, resolve = c => c.ctx.vocabulary(c arg inputText)),
@@ -70,6 +72,7 @@
     Field("posStats",deriveObjectType[Unit,PosStatsResult](Interfaces[Unit,PosStatsResult](ResultType)),
      Some("Returns posStats for text"),
       arguments = inputText :: Nil, resolve = c => c.ctx.posStats(c arg inputText)),
+
     Field("moves",deriveObjectType[Unit,StringListResult](Interfaces[Unit,StringListResult](ResultType)),
       description = Some("Returns a list of moves for the input text"),
       arguments = inputText :: moveGrammar :: Nil, resolve = c => c.ctx.moves(c arg inputText,c arg moveGrammar))
diff --git a/src/main/scala/models/Results.scala b/src/main/scala/models/Results.scala
index f6bf51c..09221e4 100644
--- a/src/main/scala/models/Results.scala
+++ b/src/main/scala/models/Results.scala
@@ -31,30 +31,31 @@ object Results {
 
   trait Result {
     val analytics: Any
     val timestamp: String = OffsetDateTime.now().toString
-    val querytime: Int = -1
-    val message: String = ""
+    val querytime: Int
+    val message: String
   }
 
-  case class StringResult(analytics: String) extends Result
-  val StringResultType = deriveObjectType[Unit,StringResult](Interfaces[Unit,StringResult](ResultType))
+  case class StringResult(analytics: String, message:String = "", querytime:Int = -1) extends Result
 
-  case class StringListResult(analytics: List[List[String]]) extends Result
+  case class StringListResult(analytics: Vector[Vector[String]], message:String = "", querytime:Int = -1) extends Result
 
-  case class SentencesResult(analytics: List[TapSentence]) extends Result
+  case class SentencesResult(analytics: Vector[TapSentence], message:String = "", querytime:Int = -1) extends Result
 
-  case class VocabResult(analytics: TapVocab) extends Result
+  case class VocabResult(analytics: TapVocab, message:String = "", querytime:Int = -1) extends Result
 
-  case class MetricsResult(analytics: TapMetrics) extends Result
+  case class MetricsResult(analytics: TapMetrics, message:String = "", querytime:Int = -1) extends Result
 
-  case class PosStatsResult(analytics: TapPosStats) extends Result
+  case class PosStatsResult(analytics: TapPosStats, message:String = "", querytime:Int = -1) extends Result
 
-  case class ExpressionsResult(analytics: List[TapExpressions]) extends Result
+  case class ExpressionsResult(analytics: Vector[TapExpressions], message:String = "", querytime:Int = -1) extends Result
 
-  case class SpellingResult(analytics: List[TapSpelling]) extends Result
+  case class SpellingResult(analytics: Vector[TapSpelling], message:String = "", querytime:Int = -1) extends Result
 
   @GraphQLName("syllables")
   @GraphQLDescription("Get syllable counts and averages.")
-  case class SyllablesResult(analytics: List[TapSyllables]) extends Result
+  case class SyllablesResult(analytics: Vector[TapSyllables], message:String = "", querytime:Int = -1) extends Result
+
+  val StringResultType = deriveObjectType[Unit,StringResult](Interfaces[Unit,StringResult](ResultType))
 
   object Implicits {
     implicit val ResultType:InterfaceType[Unit,Result] = InterfaceType(
@@ -64,7 +65,6 @@ object Results {
       Field("message", StringType, resolve = _.value.message)
     )
   )
-
     implicit val TokenType:ObjectType[Unit,TapToken] = deriveObjectType[Unit,TapToken]()
     implicit val SentenceType:ObjectType[Unit,TapSentence] = deriveObjectType[Unit,TapSentence]()
     implicit val TermCountType:ObjectType[Unit,TermCount] = deriveObjectType[Unit,TermCount]()
diff --git a/src/main/scala/tap/data/TapVocab.scala b/src/main/scala/tap/data/TapVocab.scala
index 88297a1..17b8656 100644
--- a/src/main/scala/tap/data/TapVocab.scala
+++ b/src/main/scala/tap/data/TapVocab.scala
@@ -21,5 +21,5 @@ package tap.data
   */
 
 case class TermCount(term:String,count:Int)
-case class CountTerms(count:Int,terms:List[String])
-case class TapVocab(unique: Int, terms: List[TermCount])
\ No newline at end of file
+case class CountTerms(count:Int,terms:Vector[String])
+case class TapVocab(unique: Int, terms: Vector[TermCount])
\ No newline at end of file
diff --git a/src/main/scala/tap/pipelines/Annotating.scala b/src/main/scala/tap/pipelines/Annotating.scala
index 804bb17..0b842a1 100644
--- a/src/main/scala/tap/pipelines/Annotating.scala
+++ b/src/main/scala/tap/pipelines/Annotating.scala
@@ -23,97 +23,131 @@ import akka.actor.ActorRef
 import akka.pattern.ask
 import akka.stream.scaladsl.Flow
 import akka.util.Timeout
-import cc.factorie.app.nlp.{Document, Section}
-import io.nlytx.factorie_nlp_api.AnnotatorPipelines
+import io.nlytx.nlp.api.AnnotatorPipelines
+import io.nlytx.nlp.api.DocumentModel.{Document, Token}
 import play.api.Logger
 import tap.analysis.Syllable
 import tap.data._ // scalastyle:ignore
 import tap.nlp.factorie.LanguageToolActor.{CheckSpelling, INIT}
+import tap.pipelines.AnnotatingTypes._ // scalastyle:ignore
+
 import scala.concurrent.ExecutionContext.Implicits.global
 import scala.concurrent.Future
 import scala.concurrent.duration.DurationInt
 import scala.language.postfixOps
-import scala.util.{Failure, Success}
+import scala.util.{Failure, Success, Try}
 
 /**
   * Created by andrew@andrewresearch.net on 6/9/17.
   */
+
 class Annotating @Inject()(@Named("languagetool") languageTool: ActorRef) {
 
   val logger: Logger = Logger(this.getClass)
 
+  /* The main pipelines for performing annotating text analysis */
+  object Pipeline {
+    val sentences: SentencesFlow = tapSentences
+    val vocab: VocabFlow = tapSentences via tapVocab
+    val metrics: MetricsFlow = tapSentences via tapMetrics
+    val expressions: ExpressionsFlow = tapSentences via tapExpressions
+    val syllables: SyllablesFlow = tapSentences via tapSyllables
+    val spelling: SpellingFlow = tapSentences via tapSpelling
+    val posStats: PosStatsFlow = tapSentences via tapPosStats
+  }
+
+  def build[T](pipetype:String,pipeline: Flow[Document,T,NotUsed]):Flow[String,T,NotUsed] = {
+    val makeDoc = pipetype match {
+      case STANDARD => makeDocument
+      case FAST => makeFastDocument
+      case NER => makeNerDocument
+      case _ => makeFastDocument
+    }
+    makeDoc via pipeline
+  }
+
+  /* Initialise required analytics processes */
+
   /* Running LanguageTool in an Actor */ //TODO Perhaps look at a streamed implementation?
   implicit val timeout: Timeout = 120 seconds
   val languageToolInitialised:Future[Boolean] = ask(languageTool,INIT).mapTo[Boolean]
-  languageToolInitialised.onComplete{
+  languageToolInitialised.onComplete {
     case Success(result) => logger.info(s"LanguageTool initialised successfully: $result")
     case Failure(e) => logger.error("LanguageTool encountered an error on startup: " + e.toString)
   }
 
   /* Initialise Factorie models by running a test through docBuilder */
   logger.info("Initialising Factorie models")
-  val ap = AnnotatorPipelines
-  val docStart = Future(ap.profile("Please start Factorie!",ap.postagPipeline,180))
+  private val ap = AnnotatorPipelines
+  val docStart = Future(ap.profile("Please start Factorie!",ap.fastPipeline))
   docStart.onComplete{
     case Success(doc) => logger.info(s"Factorie started successfully [${doc.tokenCount} tokens]")
     case Failure(e) => logger.error("Factorie start failure:" + e.toString)
   }
 
-  /* The main pipelines for performing annotating text analysis */
-  object Pipeline {
-    val sentences: Flow[String, List[TapSentence], NotUsed] = makeDocument via tapSentences
-    val vocab: Flow[String, TapVocab, NotUsed] = makeDocument via tapSentences via tapVocab
-    val metrics: Flow[String, TapMetrics, NotUsed] = makeDocument via tapSentences via tapMetrics
-    val expressions: Flow[String, List[TapExpressions], NotUsed] = makeDocument via tapSentences via tapExpressions
-    val syllables: Flow[String,List[TapSyllables],NotUsed] = makeDocument via tapSentences via tapSyllables
-    val spelling: Flow[String,List[TapSpelling],NotUsed] = makeDocument via tapSentences via tapSpelling
-    val posStats: Flow[String, TapPosStats, NotUsed] = makeDocument via tapSentences via tapPosStats
-  }
 
-  /* The following are essentially pipeline segments that can be re-used in the above pipelines
+
+
+
+  /* The following are pipeline segments that can be re-used in the above pipelines
    * */
 
-  val makeDocument: Flow[String, Document, NotUsed] = Flow[String]
+  val makeDocument: DocumentFlow = Flow[String]
     .mapAsync[Document](2) { text =>
-      ap.process(text,ap.defaultPipeline)
+      ap.process(text,ap.parserPipeline)
     }
 
-  val sections: Flow[Document,List[Section],NotUsed] = Flow[Document]
-    .map(_.sections.toList)
+  //If parser info is NOT required
+  val makeFastDocument: DocumentFlow = Flow[String]
+    .mapAsync[Document](2) { text =>
+      ap.process(text,ap.defaultPipeline)
+    }
+
+  //If nertags ARE required
+  val makeNerDocument: DocumentFlow = Flow[String]
+    .mapAsync[Document](2) { text =>
+      ap.process(text,ap.completePipeline)
+    }
 
-  val tapSentences: Flow[Document, List[TapSentence], NotUsed] = Flow[Document]
+  val sections: Flow[Document,Sections,NotUsed] = Flow[Document]
+    .map(_.sections.toVector)
+
+  val tapSentences: SentencesFlow = Flow[Document]
     .map { doc =>
-      doc.sentences.toList
+      doc.sentences
     }
     .map { sentList =>
       sentList.zipWithIndex.map { case(s,idx) =>
-        val tokens = s.tokens.toList.map { t =>
-          val children = t.parseChildren.map(_.positionInSentence).toVector
+        val tokens = s.tokens.toVector.map { t =>
+          val (children,parent,parseLabel) = getParseData(t).toOption.getOrElse((Vector(),-1,""))
+          val nerTag = Try(t.nerTag.baseCategoryValue).toOption.getOrElse("")
           TapToken(t.positionInSentence,t.string,t.lemmaString,t.posTag.value.toString,
-            t.nerTag.baseCategoryValue,t.parseParentIndex,children,t.parseLabel.value.toString(),t.isPunctuation)
-        }.toVector
+            nerTag,parent,children,parseLabel,t.isPunctuation)
+        }
         TapSentence(s.documentString ,tokens, s.start, s.end, s.length, idx)
-      }
+      }.toVector
     }
 
+  private def getParseData(t:Token):Try[(Vector[Int],Int,String)] = Try {
+    (t.parseChildren.toVector.map(_.positionInSentence),t.parseParentIndex,t.parseLabel.value.toString)
+  }
 
-
-  val tapVocab: Flow[List[TapSentence], TapVocab, NotUsed] = Flow[List[TapSentence]]
-    .map { lst =>
-      lst.flatMap(_.tokens)
+  val tapVocab: Flow[TapSentences, TapVocab, NotUsed] = Flow[TapSentences]
+    .map { v =>
+      v.flatMap(_.tokens)
         .map(_.term.toLowerCase)
         .groupBy((term: String) => term)
         .mapValues(_.length)
     }.map { m =>
-      val lst: List[TermCount] = m.toList.map { case (k, v) => TermCount(k, v) }
+      val lst: Vector[TermCount] = m.toVector.map { case (k, v) => TermCount(k, v) }
       TapVocab(m.size, lst)
     }
 
-  val tapMetrics: Flow[List[TapSentence], TapMetrics, NotUsed] = Flow[List[TapSentence]]
-    .map { lst =>
-      lst.map { s =>
+  val tapMetrics: Flow[TapSentences, TapMetrics, NotUsed] = Flow[TapSentences]
+    .map { v =>
+      v.map { s =>
         val tokens:Int = s.tokens.length
        val characters:Int = s.original.length
         val punctuation:Int = s.tokens.count(_.isPunctuation)
@@ -127,20 +161,20 @@
     }
     .map { res =>
       val sentCount:Int = res.length
-      val sentWordCounts = res.map(_._2).toVector
+      val sentWordCounts = res.map(_._2)
       val wordCount = sentWordCounts.sum
       val averageSentWordCount = wordCount / sentCount.toDouble
-      val wordLengths = res.map(_._6).toVector
+      val wordLengths = res.map(_._6)
       val averageWordLength = wordLengths.flatten.sum / wordCount.toDouble
-      val averageSentWordLength = res.map(_._7).toVector
+      val averageSentWordLength = res.map(_._7)
       TapMetrics(res.length, res.map(_._1).sum, wordCount,res.map(_._3).sum, res.map(_._4).sum, res.map(_._5).sum,
         sentWordCounts, averageSentWordCount, wordLengths ,averageWordLength,averageSentWordLength)
     }
 
-  val tapExpressions: Flow[List[TapSentence], List[TapExpressions], NotUsed] = Flow[List[TapSentence]]
-    .mapAsync[List[TapExpressions]](3) { lst =>
-      val results = lst.map { sent =>
+  val tapExpressions: Flow[TapSentences, Vector[TapExpressions], NotUsed] = Flow[TapSentences]
+    .mapAsync[Vector[TapExpressions]](3) { v =>
+      val results = v.map { sent =>
         for {
           ae <- Expressions.affect(sent.tokens)
           ee <- Expressions.epistemic(sent.tokens)
@@ -150,27 +184,27 @@
           me <- Expressions.modal(sent.tokens)
         } yield (TapExpressions(ae, ee, me, sent.idx))
       }
       Future.sequence(results)
     }
 
-  val tapSyllables: Flow[List[TapSentence],List[TapSyllables], NotUsed] = Flow[List[TapSentence]]
-    .map { lst =>
-      lst.map { sent =>
+  val tapSyllables: Flow[TapSentences,Vector[TapSyllables], NotUsed] = Flow[TapSentences]
+    .map { v =>
+      v.map { sent =>
         val counts = sent.tokens.map( t => Syllable.count(t.term.toLowerCase)).filterNot(_ == 0)
         val avg = counts.sum / sent.tokens.length.toDouble
         TapSyllables(sent.idx,avg,counts)
       }
     }
 
-  val tapSpelling: Flow[List[TapSentence],List[TapSpelling],NotUsed] = Flow[List[TapSentence]]
-    .mapAsync[List[TapSpelling]](2) { lst =>
-      val checked = lst.map { sent =>
+  val tapSpelling: Flow[TapSentences,Vector[TapSpelling],NotUsed] = Flow[TapSentences]
+    .mapAsync[Vector[TapSpelling]](2) { v =>
+      val checked = v.map { sent =>
         ask(languageTool,CheckSpelling(sent.original)).mapTo[Vector[TapSpell]].map(sp => TapSpelling(sent.idx,sp))
       }
       Future.sequence(checked)
     }
 
-  val tapPosStats:Flow[List[TapSentence], TapPosStats, NotUsed] = Flow[List[TapSentence]]
-    .map { lst =>
-      val stats = lst.map { s =>
+  val tapPosStats:Flow[TapSentences, TapPosStats, NotUsed] = Flow[TapSentences]
+    .map { v =>
+      val stats = v.map { s =>
         val ts = s.tokens
         val tokens:Int = ts.length
         val punctuation:Int = ts.count(_.isPunctuation)
@@ -184,11 +218,11 @@
       val words = stats.map(_._1)
       val ners = stats.map(_._2)
       val verbs = stats.map(_._3)
-      val verbDist = verbs.map(_ / verbs.sum.toDouble).toVector
+      val verbDist = verbs.map(_ / verbs.sum.toDouble)
       val nouns = stats.map(_._4)
-      val nounDist = nouns.map(_ / nouns.sum.toDouble).toVector
+      val nounDist = nouns.map(_ / nouns.sum.toDouble)
      val adjs = stats.map(_._5)
-      val adjDist = adjs.map(_ / adjs.sum.toDouble).toVector
+      val adjDist = adjs.map(_ / adjs.sum.toDouble)
       val verbNounRatio = verbs.sum / nouns.sum.toDouble
       val futurePastRatio = 0.0
       val nerWordRatio = ners.sum / words.sum.toDouble
@@ -196,9 +230,6 @@
       val adjWordRatio = adjs.sum / words.sum.toDouble
       TapPosStats(verbNounRatio,futurePastRatio,nerWordRatio,adjWordRatio,nounDist,verbDist,adjDist)
     }
 
-  val addNer:Flow[Document,Document,NotUsed] = Flow[Document].map(d => d)
-
-  val addParse:Flow[Document,Document,NotUsed] = Flow[Document].map(d => d)
 
 }
diff --git a/src/main/scala/tap/pipelines/AnnotatingTypes.scala b/src/main/scala/tap/pipelines/AnnotatingTypes.scala
new file mode 100644
index 0000000..04bab34
--- /dev/null
+++ b/src/main/scala/tap/pipelines/AnnotatingTypes.scala
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2016-2017 original author or authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package tap.pipelines + +import akka.NotUsed +import akka.stream.scaladsl.Flow +import io.nlytx.nlp.api.DocumentModel.{Document, Section} +import tap.data._ // scalastyle:ignore + +/** + * Created by andrew@andrewresearch.net on 6/11/17. + */ + +object AnnotatingTypes { + /* pipetype values */ + val STANDARD = "standard" + val FAST = "fast" + val NER = "ner" + val DEFAULT = FAST + def validPipeType(pipetype:Option[String]):String = { + val pt = pipetype.getOrElse(DEFAULT).toLowerCase + if(List(STANDARD,FAST,NER).contains(pt)) pt else DEFAULT + } + + /* Some convenience types */ + type TapSentences = Vector[TapSentence] + type Sections = Vector[Section] + + type DocumentFlow = Flow[String, Document, NotUsed] + type SentencesFlow = Flow[Document, TapSentences, NotUsed] + type VocabFlow = Flow[Document, TapVocab, NotUsed] + type MetricsFlow = Flow[Document, TapMetrics, NotUsed] + type ExpressionsFlow = Flow[Document, Vector[TapExpressions], NotUsed] + type SyllablesFlow = Flow[Document, Vector[TapSyllables],NotUsed] + type SpellingFlow = Flow[Document, Vector[TapSpelling],NotUsed] + type PosStatsFlow = Flow[Document, TapPosStats, NotUsed] +} diff --git a/src/main/scala/tap/pipelines/Cleaning.scala b/src/main/scala/tap/pipelines/Cleaning.scala index f063b6f..24b4acd 100644 --- a/src/main/scala/tap/pipelines/Cleaning.scala +++ b/src/main/scala/tap/pipelines/Cleaning.scala @@ -21,7 +21,7 @@ import akka.stream.scaladsl.Flow /***************************************** * Cleaning - * The pipelines for cleaning text + * The pipelines for clean text * Nested object holds pipelines that take a * stream of Char and produce a string stream */ @@ -30,7 +30,7 @@ class Cleaning { /**************************************** * Pipeline * A convenience object that holds the pipelines - * for cleaning + * for clean */ object Pipeline { //Flow[ByteString,String,NotUsed] val revealInvisible:Flow[String,String,NotUsed] = utf8Str via visibleWhitespace via replaceControl diff --git a/src/test/scala/tap/pipelines/AnnotatingPipelineSpec.scala b/src/test/scala/tap/pipelines/AnnotatingPipelineSpec.scala index aadc434..fa4d756 100644 --- a/src/test/scala/tap/pipelines/AnnotatingPipelineSpec.scala +++ b/src/test/scala/tap/pipelines/AnnotatingPipelineSpec.scala @@ -18,14 +18,10 @@ package tap.pipelines import akka.NotUsed import akka.stream.scaladsl.{Flow, Keep, Sink, Source} -import cc.factorie.app.nlp.Document -import io.nlytx.factorie_nlp_api.AnnotatorPipelines -import org.scalatest.AsyncFlatSpec +import io.nlytx.nlp.api.AnnotatorPipelines +import io.nlytx.nlp.api.DocumentModel.Document import org.scalatestplus.play.PlaySpec -import org.scalatestplus.play.guice.{GuiceOneAppPerTest, GuiceOneServerPerTest} import play.api.inject.guice.GuiceApplicationBuilder -import play.api.test.Injecting -import tap.data._ import scala.concurrent.duration._ import scala.concurrent.{Await, Future}