-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
198 lines (148 loc) · 7.14 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import pandas as pd
from search import Search
from googletrans import Translator
def load_model(model_name, device_map = {"": "cuda:1"}, load_in_4bit = True):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
device_map=device_map,
load_in_4bit=True
)
return tokenizer, model
# def generate(chat, model, tokenizer, max_new_tokens = 1000, num_return_sequences = 1):
# """
# This function takes as an input the logs of a chat along with a model and a tokenizer and generates a list of repsonses (or response) of the LM
# """
# input_ids = tokenizer.apply_chat_template(chat, tokenize=True, return_tensors="pt").to("cuda")
# outputs = model.generate(input_ids, max_new_tokens=max_new_tokens, num_return_sequences=num_return_sequences, do_sample=False) #top_p=0.92, top_k=500)
# return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
# def decode_chat_output(output):
# return output.split("[/INST]")[-1].strip()
# def generate_thoughts(chat, model, tokenizer, num_of_thoughts = 5):
# """
# Sample num_of_thoughts different responses
# """
# # outputs = generate(chat, model, tokenizer, max_new_tokens = 250, num_return_sequences = 3)
# responses = []
# for i in range (num_of_thoughts):
# output = generate(chat, model, tokenizer)
# responses.append(decode_chat_output(output))
# return responses
# class Chat:
# def __init__(self, model, tokenizer, system_message):
# self.model = model
# self.tokenizer = tokenizer
# self.chat = [{"role": "system", "content": system_message}]
# def add_users_input(self, users_input):
# self.chat.append({"role": "user", "content": users_input})
# def answer(self, max_new_tokens = 250):
# output = generate(self.chat, self.model, self.tokenizer, max_new_tokens=max_new_tokens)[0]
# output = decode_chat_output(output)
# self.chat.append({"role": "assistant", "content": output})
# return output
# def create_occupation_promp(text):
# prompt = f"""In the following text identify the occupations and provide the occupation title along with a small definition of this occupation, in the following format:
# Occupation title: [Occupation title as it is referred in the text]
# Definition: [Definition]
# {text}
# """
# # prompt = f"""This is a conversation between User and a friendly chatbot. This chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.
# # User:
# # In the following text, identify if there are occupation titles explicitly stated. If there are not respond with:
# # Zero occupations are explicitly stated in this text.
# # Otherwise provide the occupation title along with a brief definition in the following format:
# # Occupation title: [Occupation title as it is referred to in the text]
# # Definition: [Definition]
# # Input:
# # {text}
# # """
# # prompt = f"""In the following text, identify the occupation titles that are explicitly stated and provide the occupation title along with a brief definition in the following format:
# # Occupation title: [Occupation title as it is referred to in the text]
# # Definition: [Definition]
# # If no occupation is identified, please respond with:
# # "No occupation found."
# # Here are some examples:
# # Input:
# # The doctor consulted with the nurse to determine the best treatment plan for the patient.
# # Occupation title: Doctor
# # Definition: A licensed medical professional who diagnoses and treats illnesses and injuries.
# # Occupation title: Nurse
# # Definition: A healthcare professional who cares for patients, administers medication, and supports doctors in the provision of healthcare services.
# # Input:
# # They worked together to ensure the best possible care for their patient.
# # No occupation found.
# # Input:
# # {text}
# """
return prompt
def extract_occupations_from_resp(text):
"""
Extracts occupation titles and descriptions from text using regular expressions.
Args:
text: The text to extract occupations from.
Returns:
A list of dictionaries, where each dictionary represents an occupation with keys
'title' and 'definition'.
"""
text = text.lower()
occupations = []
pattern = r"occupation title: (.*?)\ndefinition:(.*?)(?=\noccupation title:|$)"
# pattern = r"occupation title: (.*?)\ndefinition:(.*?)(?=\n\d+\. occupation title:|$)"
matches = re.findall(pattern, text, re.DOTALL)
for title, description in matches:
title = re.sub(r'\([^)]*\)', '', title).strip()
occupations.append({
"title": title.strip(),
"definition": description.strip()
})
return occupations
def extract_occupations_from_resp_el(text, translator):
"""
Extracts occupation titles and descriptions from text using regular expressions.
Args:
text: The text to extract occupations from.
Returns:
A list of dictionaries, where each dictionary represents an occupation with keys
'title' and 'definition'.
"""
text = text.lower()
occupations = []
pattern = r"τίτλος επαγγέλματος: (.*?)\nορισμός:(.*?)(?=\nΤίτλος επαγγέλματος:|$)"
matches = re.findall(pattern, text, re.DOTALL)
for title, description in matches:
title = re.sub(r'\([^)]*\)', '', title).strip()
occupations.append({
"title": title.strip(),
"definition": translator.translate(description.strip(), "el", "en")
})
return occupations
class GoogleTranslate:
def __init__(self):
self.translator = Translator(service_urls=['translate.googleapis.com'])
def translate(self, source_text, source_lang, target_lang):
return self.translator.translate(source_text, src= source_lang, dest=target_lang).text
def translate_batch(self, source_texts, source_lang, target_lang):
return [self.translator.translate(source_text, src= source_lang, dest=target_lang).text for source_text in source_texts]
class Knowledge:
def __init__(self, filename, column):
self.occ_defs = pd.read_csv(filename, sep =";", dtype={'ISCO 08 Code': str})
texts = self.occ_defs[column].tolist()
self.s = Search(texts)
def connect(self, descr, top_k = 5):
return self.s.search(descr)[:top_k]
def describe_occ(self, index):
row = self.occ_defs.iloc[index]
return f"""Level: {row['Level']}
ISCO 08 Code: {row['ISCO 08 Code']}
Title EN: {row['Title EN']}
Definition: {row['Definition']}
"""
def describe_occ_dict(self, index):
row = self.occ_defs.iloc[index]
return {
"level": row['Level'],
"title EN": row['Title EN'],
"definition": row['Definition'],
"ISCO 08 Code": row['ISCO 08 Code']
}