diff --git a/python/philologic/runtime/reports/collocation.py b/python/philologic/runtime/reports/collocation.py index aaa22168..0abcc0a0 100644 --- a/python/philologic/runtime/reports/collocation.py +++ b/python/philologic/runtime/reports/collocation.py @@ -14,7 +14,7 @@ from orjson import dumps -def collocation_results(request, config): +def collocation_results(request, config, current_collocates): """Fetch collocation results""" collocation_object: dict[str, Any] = {"query": dict([i for i in request])} db = DB(config.db_path + "/data/") @@ -81,7 +81,11 @@ def collocation_results(request, config): max_time = None else: max_time = request.max_time or 2 - all_collocates = {} + + if current_collocates: + all_collocates = dict(current_collocates) + else: + all_collocates = {} start_time = timeit.default_timer() env = lmdb.open( @@ -129,18 +133,17 @@ def collocation_results(request, config): if collocate is not None: # in the event lemma is None if collocate_distance is None: if collocate not in all_collocates: - all_collocates[collocate] = {"count": 1} + all_collocates[collocate] = 1 else: - all_collocates[collocate]["count"] += 1 + all_collocates[collocate] += 1 else: if abs(position - q_word_position[0]) <= collocate_distance: # type: ignore if collocate not in all_collocates: - all_collocates[collocate] = {"count": 1} + all_collocates[collocate] = 1 else: - all_collocates[collocate]["count"] += 1 + all_collocates[collocate] += 1 hits_done += 1 - elapsed = timeit.default_timer() - start_time # split the query if more than request.max_time has been spent in the loop if max_time is not None: @@ -149,6 +152,7 @@ def collocation_results(request, config): env.close() hits.finish() + all_collocates = sorted(all_collocates.items(), key=lambda item: item[1], reverse=True) collocation_object["collocates"] = all_collocates collocation_object["results_length"] = len(hits) if hits_done < collocation_object["results_length"]: @@ -158,10 +162,6 @@ def collocation_results(request, config): collocation_object["more_results"] = False collocation_object["hits_done"] = collocation_object["results_length"] collocation_object["distance"] = collocate_distance - if len(request.metadata) == 1: # request.metadata always has philo_type as a key - collocation_object["whole_corpus"] = True - else: - collocation_object["whole_corpus"] = False return collocation_object diff --git a/www/app/package-lock.json b/www/app/package-lock.json index 7dcc4f90..8f726671 100644 --- a/www/app/package-lock.json +++ b/www/app/package-lock.json @@ -16,6 +16,7 @@ "core-js": "^3.13.1", "glightbox": "^3.2.0", "gsap": "^3.9.1", + "pako": "^2.1.0", "vue": "^3.2.0", "vue-i18n": "^9.2.2", "vue-router": "^4.0.0", @@ -2463,6 +2464,11 @@ "node": ">= 0.8.0" } }, + "node_modules/pako": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/pako/-/pako-2.1.0.tgz", + "integrity": "sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug==" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -4975,6 +4981,11 @@ "word-wrap": "^1.2.3" } }, + "pako": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/pako/-/pako-2.1.0.tgz", + "integrity": "sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug==" + }, "parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", diff --git a/www/app/package.json b/www/app/package.json index bbee5794..a4ee0377 100644 --- a/www/app/package.json +++ b/www/app/package.json @@ -53,4 +53,4 @@ "> 1%", "last 2 versions" ] -} \ No newline at end of file +} diff --git a/www/app/src/components/Collocation.vue b/www/app/src/components/Collocation.vue index 452d7d8f..2dbb9bca 100644 --- a/www/app/src/components/Collocation.vue +++ b/www/app/src/components/Collocation.vue @@ -9,8 +9,8 @@ @click="getFrequency()"> {{ $t("collocation.collocation") }} - @@ -155,7 +155,7 @@ @@ -283,6 +283,7 @@ export default { biblio: {}, moreResults: false, sortedList: [], + collocateCounts: [], showFilteredWords: false, runningTotal: 0, collocCloudWords: [], @@ -303,10 +304,10 @@ export default { otherCollocates: [], otherBiblio: {}, comparedTo: "wholeCorpus", - wholeCorpus: true, filterMetadataOpen: false, compareSearching: false, - comparativeSearchStarted: false + comparativeSearchStarted: false, + otherDone: false, }; }, created() { @@ -332,7 +333,6 @@ export default { methods: { fetchResults() { this.localFormData = this.copyObject(this.$store.state.formData); - var collocObject = {}; this.searching = true; this.relativeFrequencies = {}; this.collocMethod = "frequency" @@ -340,7 +340,7 @@ export default { this.underRepresented = []; this.other_corpus_metadata = {}; this.comparativeSearchStarted = false - this.updateCollocation(collocObject, 0); + this.updateCollocation({}, 0); }, buildMetadata(metadata) { this.metadataDisplay = metadata.display; @@ -357,55 +357,45 @@ export default { start: start.toString(), }; this.$http - .get(`${this.$dbUrl}/reports/collocation.py`, { - params: this.paramsFilter(params), - }) + .post(`${this.$dbUrl}/reports/collocation.py`, { + current_collocates: fullResults, + }, + { + params: this.paramsFilter(params), + }) .then((response) => { - let data = response.data; - this.wholeCorpus = response.data.whole_corpus; - this.resultsLength = data.results_length; - this.moreResults = data.more_results; - this.runningTotal = data.hits_done; - start = data.hits_done; + this.resultsLength = response.data.results_length; + this.moreResults = response.data.more_results; + this.runningTotal = response.data.hits_done; + start = response.data.hits_done; this.searching = false; if (this.resultsLength) { - this.sortAndRenderCollocation(fullResults, data, start); + if (this.moreResults) { + this.sortedList = this.extractSurfaceFromCollocate(response.data.collocates.slice(0, 100)); + this.updateCollocation(response.data.collocates, start); + } + else { + this.collocateCounts = response.data.collocates; + this.sortedList = this.extractSurfaceFromCollocate(response.data.collocates.slice(0, 100)); + this.done = true + } } + }) .catch((error) => { this.searching = false; this.debug(this, error); }); }, - sortAndRenderCollocation(fullResults, data, start) { - if (typeof fullResults === "undefined" || Object.keys(fullResults).length === 0) { - fullResults = {}; - this.filterList = data.filter_list; - } - var collocates = this.mergeResults(fullResults, data.collocates); - this.collocatesUnsorted = collocates.unsorted - this.sortedList = this.extractSurfaceFromCollocate(collocates.sorted.slice(0, 100)); - // this.buildWordCloud(); - if (this.moreResults) { - var tempFullResults = collocates.unsorted; - var runningQuery = this.$store.state.formData; - if (this.report === "collocation" && this.deepEqual(runningQuery, this.localFormData)) { - // make sure we're still running the same query - this.updateCollocation(tempFullResults, start); - } - } else { - this.done = true; - } - }, extractSurfaceFromCollocate(words) { let newWords = [] - for (let word of words) { - let collocate = `${word.label}`.replace(/lemma:/, ""); + for (let wordObj of words) { + let collocate = `${wordObj[0]}`.replace(/lemma:/, ""); if (collocate.search(/\w+:.*/) != -1) { collocate = collocate.replace(/(\p{L}+):.*/u, "$1"); } - let surfaceForm = word.label; - newWords.push({ collocate: collocate, surfaceForm: surfaceForm, count: word.count }); + let surfaceForm = wordObj[0]; + newWords.push({ collocate: collocate, surfaceForm: surfaceForm, count: wordObj[1] }); } return newWords }, @@ -469,14 +459,54 @@ export default { }) }) }, - comparativeCollocations(method) { + getOtherCollocates(fullResults, start) { + // Check if this.compareMetadataValues is empty + if (Object.keys(this.comparedMetadataValues).length === 0) { + this.wholeCorpus = true + } else { + this.wholeCorpus = false + } + this.collocMethod = 'compare'; + let params = { + q: this.q, + ...this.comparedMetadataValues, + start: start.toString(), + }; + this.otherDone = false; + this.$http + .post(`${this.$dbUrl}/reports/collocation.py`, { + current_collocates: fullResults, + }, + { + params: this.paramsFilter(params), + }) + .then((response) => { + let resultsLength = response.data.results_length; + let moreResults = response.data.more_results; + let start = response.data.hits_done; + if (resultsLength) { + if (moreResults) { + this.getOtherCollocates(response.data.collocates, start); + } + else { + this.otherDone = true + this.comparativeCollocations(response.data.collocates) + } + } + + }) + .catch((error) => { + this.searching = false; + this.debug(this, error); + }); + }, + comparativeCollocations(otherCollocates) { let collapseElement = document.getElementById('other-corpus-metadata') if (collapseElement != null) { Collapse.getInstance(collapseElement).hide() this.filterMetadataOpen = false } this.comparativeSearchStarted = true; - this.collocMethod = method; this.comparedMetadataValues = this.dateRangeHandler(this.metadataInputStyle, this.dateRange, this.dateType, this.comparedMetadataValues) this.otherBiblio = this.buildBiblioCriteria(this.$philoConfig, this.comparedMetadataValues, this.comparedMetadataValues) this.compareSearching = true; @@ -484,21 +514,17 @@ export default { this.underRepresented = []; this.otherCollocates = []; this.$http.post(`${this.$dbUrl}/scripts/comparative_collocations.py`, { - all_collocates: this.collocatesUnsorted, - other_corpus_metadata: this.comparedMetadataValues - }, { - params: { - ...this.$store.state.formData, - }, - + all_collocates: this.collocateCounts, + other_collocates: otherCollocates, + whole_corpus: this.wholeCorpus, }, { headers: { 'Content-Type': 'application/x-www-form-urlencoded' } }).then((response) => { + this.otherCollocates = this.extractSurfaceFromCollocate(otherCollocates.slice(0, 100)); this.overRepresented = this.extractSurfaceFromCollocate(response.data.top); this.underRepresented = this.extractSurfaceFromCollocate(response.data.bottom); - this.otherCollocates = this.extractSurfaceFromCollocate(response.data.other_collocates); this.relativeFrequencies = { top: this.overRepresented, bottom: this.underRepresented }; this.compareSearching = false; diff --git a/www/app/src/locales/en.json b/www/app/src/locales/en.json index 05acf6f2..0e894e7b 100644 --- a/www/app/src/locales/en.json +++ b/www/app/src/locales/en.json @@ -63,7 +63,7 @@ "filterCollocate": "Filter collocates by", "collocatesWithin": "Collocates within", "words": "words", - "collocatesWithinSentence": "Collocates within the same sentence", + "collocatesWithinSentence": "Collocates within the same sentence" }, "resultsBiblio": { "heading": "Bibliography of results on this page", diff --git a/www/app/src/locales/fr.json b/www/app/src/locales/fr.json index 2f9763cd..26aed451 100644 --- a/www/app/src/locales/fr.json +++ b/www/app/src/locales/fr.json @@ -62,7 +62,7 @@ "filterCollocate": "Filtrer les collocations par", "collocatesWithin": "Collocations dans une fenêtre de", "words": "mots", - "collocatesWithinSentence": "Collocations dans la même phrase", + "collocatesWithinSentence": "Collocations dans la même phrase" }, "resultsBiblio": { "heading": "Bibliographie des résultats sur cette page", diff --git a/www/reports/collocation.py b/www/reports/collocation.py index dbb54577..b1d28049 100755 --- a/www/reports/collocation.py +++ b/www/reports/collocation.py @@ -27,9 +27,23 @@ def collocation(environ, start_response): config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("reports", "")) request = WSGIHandler(environ, config) + if environ["REQUEST_METHOD"] == "OPTIONS": + # Handle preflight request + start_response( + "200 OK", + [ + ("Content-Type", "text/plain"), + ("Access-Control-Allow-Origin", environ["HTTP_ORIGIN"]), # Replace with your client domain + ("Access-Control-Allow-Methods", "POST, OPTIONS"), + ("Access-Control-Allow-Headers", "Content-Type"), # Adjust if needed for your headers + ], + ) + return [b""] # Empty response body for OPTIONS headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")] start_response("200 OK", headers) - collocation_object = collocation_results(request, config) + post_data = environ["wsgi.input"].read() + current_collocates = orjson.loads(post_data)["current_collocates"] + collocation_object = collocation_results(request, config, current_collocates) yield orjson.dumps(collocation_object) diff --git a/www/scripts/comparative_collocations.py b/www/scripts/comparative_collocations.py index d5e5f899..2b4d2daf 100755 --- a/www/scripts/comparative_collocations.py +++ b/www/scripts/comparative_collocations.py @@ -42,22 +42,11 @@ def get_collocation_relative_proportions(environ, start_response): status = "200 OK" headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")] start_response(status, headers) - config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", "")) - request = WSGIHandler(environ, config) - - post_data = environ["wsgi.input"].read() - all_collocates = orjson.loads(post_data)["all_collocates"] - other_corpus_metadata = orjson.loads(post_data)["other_corpus_metadata"] - - whole_corpus = True - if other_corpus_metadata: - request.metadata = other_corpus_metadata - whole_corpus = False - else: - # Run collocation against whole corpus - request.metadata = {} # Clear metadata to run against whole corpus - request.max_time = None # fetch all results - other_collocates = collocation_results(request, config)["collocates"] + + post_data = orjson.loads(environ["wsgi.input"].read()) + all_collocates = post_data["all_collocates"] + other_collocates = post_data["other_collocates"] + whole_corpus = post_data["whole_corpus"] top_relative_proportions, low_relative_proportions = get_relative_proportions( all_collocates, other_collocates, whole_corpus @@ -68,8 +57,7 @@ def get_collocation_relative_proportions(environ, start_response): "top": top_relative_proportions, "bottom": low_relative_proportions, "other_collocates": [ - {"label": word, "count": value["count"]} - for word, value in sorted(other_collocates.items(), key=lambda x: x[1]["count"], reverse=True)[:100] + (word, value) for word, value in sorted(other_collocates, key=lambda x: x[1], reverse=True)[:100] ], } ) @@ -77,18 +65,12 @@ def get_collocation_relative_proportions(environ, start_response): def get_relative_proportions(all_collocates, other_collocates, whole_corpus): # Create DataFrames - df_sub = pd.DataFrame.from_dict( - {k: v["count"] for k, v in all_collocates.items()}, orient="index", columns=["sub_corpus_count"] - ) - df_other = pd.DataFrame.from_dict( - {k: v["count"] for k, v in other_collocates.items()}, orient="index", columns=["other_corpus_count"] - ) + df_sub = pd.DataFrame.from_dict(dict(all_collocates), orient="index", columns=["sub_corpus_count"]) + df_other = pd.DataFrame.from_dict(dict(other_collocates), orient="index", columns=["other_corpus_count"]) # Outer Join (Preserves all collocates) df_combined = df_sub.join(df_other, how="outer").fillna(0) - df_combined.to_csv("/tmp/combined.csv") - # Adjust counts if comparing against the whole corpus if whole_corpus: df_combined["other_corpus_count"] = df_combined["other_corpus_count"] - df_combined["sub_corpus_count"] @@ -112,8 +94,10 @@ def get_relative_proportions(all_collocates, other_collocates, whole_corpus): # Over-representation score df_combined["over_representation_score"] = df_combined["sub_corpus_zscore"] - df_combined["other_corpus_zscore"] + df_combined.to_csv("/tmp/combined.csv") + top_relative_proportions = [ - {"label": word, "count": value} + (word, value) for word, value in df_combined[df_combined["over_representation_score"] > 0]["over_representation_score"] .sort_values(ascending=False) .head(100) @@ -121,7 +105,7 @@ def get_relative_proportions(all_collocates, other_collocates, whole_corpus): ] bottom_relative_proportions = [ - {"label": word, "count": abs(value)} + (word, abs(value)) for word, value in df_combined[df_combined["over_representation_score"] < 0]["over_representation_score"] .sort_values() .head(100)