From df7776e3408022cb3f01732172d55ad5650bd38d Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Tue, 3 Sep 2024 11:37:20 +0530 Subject: [PATCH 1/4] implemented: text-classification support for multi-label classification. --- langtest/datahandler/datasource.py | 2 ++ langtest/modelhandler/jsl_modelhandler.py | 38 +++++++++++++++++++---- langtest/tasks/task.py | 18 +++++++++-- langtest/utils/custom_types/output.py | 22 +++++++++---- 4 files changed, 66 insertions(+), 14 deletions(-) diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py index 868a4152a..90e3abcb8 100644 --- a/langtest/datahandler/datasource.py +++ b/langtest/datahandler/datasource.py @@ -957,6 +957,8 @@ def _import_data(self, file_name, **kwargs) -> List[Sample]: import ast i["transformations"] = ast.literal_eval(temp) + else: + i["transformations"] = None sample = self.task.get_sample_class(**i) samples.append(sample) diff --git a/langtest/modelhandler/jsl_modelhandler.py b/langtest/modelhandler/jsl_modelhandler.py index f13b18d32..9bb67fcc2 100644 --- a/langtest/modelhandler/jsl_modelhandler.py +++ b/langtest/modelhandler/jsl_modelhandler.py @@ -42,6 +42,7 @@ XlmRoBertaForSequenceClassification, XlnetForSequenceClassification, MarianTransformer, + MultiClassifierDLModel, ) from sparknlp.base import LightPipeline from sparknlp.pretrained import PretrainedPipeline @@ -63,6 +64,7 @@ SUPPORTED_SPARKNLP_CLASSIFERS.extend( [ + MultiClassifierDLModel, ClassifierDLModel, SentimentDLModel, AlbertForSequenceClassification, @@ -409,6 +411,7 @@ def __init__( super().__init__(model) _classifier = None + self.multi_label_classifier = False for annotator in self.model.stages: if self.is_classifier(annotator): _classifier = annotator @@ -417,6 +420,10 @@ def __init__( if _classifier is None: raise ValueError(Errors.E040(var="classifier")) + if isinstance(_classifier, MultiClassifierDLModel): + self.multi_label_classifier = True + self.threshold = _classifier.getThreshold() + 
self.output_col = _classifier.getOutputCol() self.classes = _classifier.getClasses() self.model = LightPipeline(self.model) @@ -442,13 +449,32 @@ def predict( Returns: SequenceClassificationOutput: Classification output from SparkNLP LightPipeline. """ - prediction_metadata = self.model.fullAnnotate(text)[0][self.output_col][ - 0 - ].metadata - prediction = [{"label": x, "score": y} for x, y in prediction_metadata.items()] + prediction_metadata = self.model.fullAnnotate(text)[0][self.output_col] + + if self.multi_label_classifier: + multi_label = True + if len(prediction_metadata) > 0: + prediction_metadata = prediction_metadata[0].metadata + + prediction = [ + {"label": x, "score": y} for x, y in prediction_metadata.items() + ] + # filter based on the threshold value with score greater than threshold + prediction = [x for x in prediction if float(x["score"]) > self.threshold] + + return SequenceClassificationOutput( + text=text, + predictions=prediction, + multi_label=multi_label, + ) + else: + return SequenceClassificationOutput( + text=text, predictions=[], multi_label=multi_label + ) - if not return_all_scores: - prediction = [max(prediction, key=lambda x: x["score"])] + else: + if not return_all_scores: + prediction = [max(prediction, key=lambda x: x["score"])] return SequenceClassificationOutput(text=text, predictions=prediction) diff --git a/langtest/tasks/task.py b/langtest/tasks/task.py index 035725bb8..93af99114 100644 --- a/langtest/tasks/task.py +++ b/langtest/tasks/task.py @@ -1,3 +1,4 @@ +import ast import re from abc import ABC, abstractmethod from typing import Union @@ -267,17 +268,28 @@ def create_sample( row_data: dict, feature_column="text", target_column: Union[samples.SequenceLabel, str] = "label", + multi_label: bool = False, + *args, + **kwargs, ) -> samples.SequenceClassificationSample: """Create a sample.""" keys = list(row_data.keys()) # auto-detect the default column names from the row_data column_mapper = cls.column_mapping(keys, 
[feature_column, target_column]) + # is multi-label classification + # if "multi_label" in kwargs: + # multi_label = kwargs.get("multi_label", False) + # kwargs.pop("multi_label") + labels = row_data.get(column_mapper[target_column]) if isinstance(labels, samples.SequenceLabel): labels = [labels] - elif isinstance(labels, list): + elif isinstance(labels, list) or isinstance(labels, str): + labels = ast.literal_eval(labels) + if not isinstance(labels, list): + labels = [labels] labels = [ samples.SequenceLabel(label=label, score=1.0) if isinstance(label, str) @@ -289,7 +301,9 @@ def create_sample( return samples.SequenceClassificationSample( original=row_data[column_mapper[feature_column]], - expected_results=samples.SequenceClassificationOutput(predictions=labels), + expected_results=samples.SequenceClassificationOutput( + predictions=labels, multi_label=multi_label + ), ) diff --git a/langtest/utils/custom_types/output.py b/langtest/utils/custom_types/output.py index bcd1e4cf0..6961e4b0f 100644 --- a/langtest/utils/custom_types/output.py +++ b/langtest/utils/custom_types/output.py @@ -8,6 +8,7 @@ class SequenceClassificationOutput(BaseModel): """Output model for text classification tasks.""" predictions: List[SequenceLabel] + multi_label: bool = False def to_str_list(self) -> str: """Convert the output into list of strings. @@ -15,18 +16,27 @@ def to_str_list(self) -> str: Returns: List[str]: predictions in form of a list of strings. 
""" - return ",".join([x.label for x in self.predictions]) + return ", ".join([x.label for x in self.predictions]) - def __str__(self): + def __str__(self) -> str: """String representation""" labels = {elt.label: elt.score for elt in self.predictions} return f"SequenceClassificationOutput(predictions={labels})" - def __eq__(self, other): + def __eq__(self, other: "SequenceClassificationOutput") -> bool: """Equality comparison method.""" - top_class = max(self.predictions, key=lambda x: x.score).label - other_top_class = max(other.predictions, key=lambda x: x.score).label - return top_class == other_top_class + + if self.multi_label: + # get all labels + self_labels = {elt.label for elt in self.predictions} + other_labels = {elt.label for elt in other.predictions} + return set(self_labels) == set(other_labels) + elif len(self.predictions) == 0 and len(other.predictions) == 0: + return True + else: + top_class = max(self.predictions, key=lambda x: x.score).label + other_top_class = max(other.predictions, key=lambda x: x.score).label + return top_class == other_top_class class MinScoreOutput(BaseModel): From 2da96b76e9d52c6e75c6c5ed2b2fe41cd9735774 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Tue, 3 Sep 2024 12:17:33 +0530 Subject: [PATCH 2/4] Refactor SequenceClassificationOutputFormatter to handle multi-label predictions --- langtest/datahandler/format.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/langtest/datahandler/format.py b/langtest/datahandler/format.py index 0755108f0..621fe34e0 100644 --- a/langtest/datahandler/format.py +++ b/langtest/datahandler/format.py @@ -108,9 +108,18 @@ def to_csv(sample: SequenceClassificationSample) -> Tuple[str, str]: Tuple[str, str]: Row formatted as a list of strings. 
""" - if sample.test_case: - return [sample.test_case, sample.expected_results.predictions[0].label] - return [sample.original, sample.expected_results.predictions[0].label] + predictions = sample.expected_results.predictions + multi_label = sample.expected_results.multi_label + + if multi_label: + return [ + sample.test_case or sample.original, + [elt.label for elt in predictions] if predictions else [], + ] + else: + if sample.test_case: + return [sample.test_case, sample.expected_results.predictions[0].label] + return [sample.original, sample.expected_results.predictions[0].label] class NEROutputFormatter(BaseFormatter): From 16fee46de37e09d003b170aa85c902428ccd5902 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Tue, 3 Sep 2024 12:35:54 +0530 Subject: [PATCH 3/4] Refactor CSVDataset to remove unnecessary transformation field --- langtest/datahandler/datasource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py index 90e3abcb8..42648338d 100644 --- a/langtest/datahandler/datasource.py +++ b/langtest/datahandler/datasource.py @@ -958,7 +958,7 @@ def _import_data(self, file_name, **kwargs) -> List[Sample]: i["transformations"] = ast.literal_eval(temp) else: - i["transformations"] = None + i.pop("transformations") sample = self.task.get_sample_class(**i) samples.append(sample) From 258a0f7ddeb9b05ca5a99c8a196febbc1dabcbd1 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Tue, 3 Sep 2024 14:19:05 +0530 Subject: [PATCH 4/4] fixed: UnboundLocalError and KeyError.
--- langtest/datahandler/datasource.py | 2 +- langtest/modelhandler/jsl_modelhandler.py | 38 ++++++++++------------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py index 42648338d..1d89303ae 100644 --- a/langtest/datahandler/datasource.py +++ b/langtest/datahandler/datasource.py @@ -957,7 +957,7 @@ def _import_data(self, file_name, **kwargs) -> List[Sample]: import ast i["transformations"] = ast.literal_eval(temp) - else: + elif "transformations" in i: i.pop("transformations") sample = self.task.get_sample_class(**i) samples.append(sample) diff --git a/langtest/modelhandler/jsl_modelhandler.py b/langtest/modelhandler/jsl_modelhandler.py index 9bb67fcc2..0b703d637 100644 --- a/langtest/modelhandler/jsl_modelhandler.py +++ b/langtest/modelhandler/jsl_modelhandler.py @@ -450,31 +450,25 @@ def predict( SequenceClassificationOutput: Classification output from SparkNLP LightPipeline. """ prediction_metadata = self.model.fullAnnotate(text)[0][self.output_col] + prediction = [] + + if len(prediction_metadata) > 0: + prediction_metadata = prediction_metadata[0].metadata + prediction = [ + {"label": x, "score": y} for x, y in prediction_metadata.items() + ] if self.multi_label_classifier: - multi_label = True - if len(prediction_metadata) > 0: - prediction_metadata = prediction_metadata[0].metadata - - prediction = [ - {"label": x, "score": y} for x, y in prediction_metadata.items() - ] - # filter based on the threshold value with score greater than threshold - prediction = [x for x in prediction if float(x["score"]) > self.threshold] - - return SequenceClassificationOutput( - text=text, - predictions=prediction, - multi_label=multi_label, - ) - else: - return SequenceClassificationOutput( - text=text, predictions=[], multi_label=multi_label - ) + prediction = [x for x in prediction if float(x["score"]) > self.threshold] - else: - if not return_all_scores: - prediction = [max(prediction, 
key=lambda x: x["score"])] + return SequenceClassificationOutput( + text=text, + predictions=prediction, + multi_label=self.multi_label_classifier, + ) + + if not return_all_scores: + prediction = [max(prediction, key=lambda x: x["score"])] return SequenceClassificationOutput(text=text, predictions=prediction)