-
Notifications
You must be signed in to change notification settings - Fork 0
/
openaiwe.py
209 lines (157 loc) · 9.31 KB
/
openaiwe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import os
from openai import OpenAI
from translation_class import replace_with_quotes_h, replace_with_quotes_hard
def is_sentence_to_translate(sentence):
    """Tell whether a sentence carries an annotated term.

    A sentence counts as annotated when it contains the opening ``<br>``
    marker that this module uses to delimit a term.

    Args:
        sentence: Text to inspect.

    Returns:
        True if ``<br>`` occurs in ``sentence``, otherwise False.
    """
    # A commented-out alternative checked for the U+FFFC character instead.
    return '<br>' in sentence
# Shared OpenAI client used by the request helpers in this module.
# The key is taken from the OPENAI_API_KEY environment variable so that no
# credential has to be written into the source file. The empty-string
# fallback is the value that was previously hard-coded here, so constructing
# the client behaves as before when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
)
# System prompt used by gpt_translator_key: asks for an English-to-Spanish
# translation that keeps the <br>...</br> markers around the annotated term,
# followed by few-shot Input/Output examples.
prompt1= """
You are a translator of English to Spanish. You will recieve a text in English, sometimes with a term marked between the XML tag <br> and </br>. Then you translate the text to Spanish maintaining the XML tags <br> and </br> between the term. Some examples:
Input: "The University of Florida, in partnership with Motorola, has held two <br>mobile computing</br> design competitions."
Output: "La Universidad de Florida, en asociación con Motorola, ha celebrado dos concursos de diseño de <br>computación móvil</br>."
Input: "Where have all the <br>PC makers</br> gone?".
Output: "¿Dónde se han ido todos los <br>fabricantes de PC</br>?".
Input: "The role of quantum entanglement of the <br>initial state</br> is discussed in detail".
Output: "El papel del enredo cuántico del <br>estado inicial</br> se discute en detalle".
Input: "It often exploits an <br>optical diffusion model-based image reconstruction algorithm</br> to estimate spatial property values from measurements of the light flux at the surface of the tissue."
Output: "A menudo se utiliza un <br>algoritmo de reconstrucción de imágenes basado en un modelo de difusión óptica</br> para estimar los valores de propiedades espaciales a partir de medidas de la flujo de luz en la superficie del tejido."
"""
# System prompt used by gpt_translator: plain sentence translation.
prompt2= "You are a scientific translator of English to Spanish. Translate the following sentence to Spanish."
# Variant for multi-line input that asks the model to keep line breaks.
# NOTE(review): not referenced anywhere in the visible part of this file.
prompt25= ("You are a scientific translator of English to Spanish language. Translate the text to Spanish. "
"The text has each sentence in a line. Keep the original line breaks.")
def translate_keyword(key, translated_sentences):
    """Translate a keyword and annotate it inside already-translated sentences.

    If ``key.is_in_text`` is false, only the bare term ``key.key`` is
    translated and the stripped result is stored in ``key.translated_term``.

    Otherwise every sentence of ``key.original_annotated_sentences`` is paired
    with the sentence at the same position in ``translated_sentences``:

    * a sentence without a ``<br>`` marker contributes its translation
      unchanged;
    * for a marked sentence the model is asked which Spanish words correspond
      to the term, and ``replace_with_quotes_hard`` is applied to the
      translation with that term (presumably re-inserting the markers --
      the helper is imported from ``translation_class``; confirm there).
      The original and the annotated sentence are appended to
      ``key.original_annotated_samples`` and
      ``key.translated_annotated_samples``.

    Processing stops after the fifth marked sentence. The collected
    sentences, each followed by one space, are stored in
    ``key.translated_annotated_text``.

    Args:
        key: Keyword object; the attributes listed above are read or written.
        translated_sentences: Spanish translations, aligned by position with
            ``key.original_annotated_sentences``.

    Returns:
        None. All results are stored on ``key``.
    """
    if not key.is_in_text:
        # Not annotated, hence the term does not occur in the text:
        # translate the bare term only.
        key.translated_term = gpt_translator(key.key).strip()
        return

    max_annotated_samples = 5  # upper bound on model-annotated sentences
    collected = []
    annotated_count = 0
    sentence_pairs = zip(key.original_annotated_sentences, translated_sentences)
    for sentence, t_sentence in sentence_pairs:
        if not is_sentence_to_translate(sentence):
            collected.append(t_sentence)
            continue

        # Ask the model for the Spanish words matching the marked term, then
        # annotate them in the existing translation.
        translated_term = gpt_translator_key2(sentence, t_sentence)
        translated_sentence = replace_with_quotes_hard(
            t_sentence, translated_term.strip()
        )

        collected.append(translated_sentence)
        key.original_annotated_samples.append(sentence)
        key.translated_annotated_samples.append(translated_sentence)

        annotated_count += 1
        if annotated_count == max_annotated_samples:
            break

    # Every sentence is followed by a single space, including the last one.
    key.translated_annotated_text = "".join(piece + " " for piece in collected)
def translate_text_original(sentences):
    """Translate a list of sentences with a single model request.

    The sentences are sent as one text with each sentence on its own
    newline-terminated line, and the response is split back on newlines.

    Args:
        sentences: Sequence of English sentence strings.

    Returns:
        A list with the lines of the model response. An empty input yields an
        empty list without contacting the model. When the number of returned
        lines differs from ``len(sentences)`` a warning is printed and the
        lines are returned as they are.
    """
    if not sentences:
        # Nothing to translate: avoid a request and a false segmentation error.
        return []

    request_text = "".join(sentence + '\n' for sentence in sentences)
    response_text = gpt_translator(request_text)

    # The request ends with a newline; if the model echoes it back, a plain
    # split would yield a spurious empty last element.
    translated_text = response_text.rstrip('\n').split('\n')

    if len(sentences) != len(translated_text):
        print(">>>> Fatal error in segmentation")
    return translated_text
def translate_text_original2(sentences):
    """Translate sentences one by one, issuing one model request per sentence.

    Args:
        sentences: Iterable of English sentence strings.

    Returns:
        A list with the translation of each sentence, in input order.
    """
    return [gpt_translator(sentence) for sentence in sentences]
def gpt_translator(text):
    """Send ``text`` to the chat model with the ``prompt2`` system prompt.

    Args:
        text: User message content to translate.

    Returns:
        The message content of the first choice in the completion.
    """
    conversation = [
        {"role": "system", "content": prompt2},
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=conversation,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
def gpt_translator_key(text):
    """Translate a marked sentence using the ``prompt1`` system prompt.

    The sentence is sent in the ``Input: "..."`` form used by the examples in
    ``prompt1``.

    Args:
        text: English sentence, possibly containing ``<br>...</br>`` markers.

    Returns:
        The message content of the first choice with every double-quote
        character removed.
    """
    user_content = 'Input: " ' + text + '"'
    conversation = [
        {"role": "system", "content": prompt1},
        {"role": "user", "content": user_content},
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=conversation,
    )
    answer = response.choices[0].message.content
    return answer.replace('"', '')
# System prompt used by gpt_translator_key2: given an English sentence with a
# term between <br> and </br> and its Spanish translation, the model is asked
# to return the Spanish words that correspond to that term. Few-shot examples
# follow the instructions.
prompt3= """
You are a scientific translator of English to Spanish specialized in terminology.
I give you one sentence in English and the same sentence translated to Spanish.
The English sentence has a term between the marks <br> and </br>.
Identify in the Spanish sentence which words correspond to the same original term.
The output term is in Spanish.
Some examples
English sentence: "The University of Florida, in partnership with Motorola, has held two <br>mobile computing</br> design competitions".
Spanish sentence : "La Universidad de Florida, en asociación con Motorola, ha celebrado dos concursos de diseño de computación móvil".
Output: computación móvil
English sentence: "There, we assume that <br>coefficients of non-renormalizable terms</br> are suppressed enough to be neglected".
Spanish sentence: "Aquí, asumimos que los coeficientes de los términos no renormalizables están suficientemente suprimidos como para ser ignorados".
Output: coeficientes de los términos no renormalizables
English sentence: "It often exploits an <br>optical diffusion model-based image reconstruction algorithm</br> to estimate spatial property values from measurements of the light flux at the surface of the tissue."
Spanish sentence: "A menudo se utiliza un algoritmo de reconstrucción de imágenes basado en un modelo de difusión óptica para estimar los valores de propiedades espaciales a partir de medidas de la flujo de luz en la superficie del tejido."
Output: algoritmo de reconstrucción de imágenes basado en un modelo de difusión óptica
English: "A second group of experiments is aimed at extensions of the baseline methods that exploit characteristic features of the UvT Expert Collection; specifically, we propose and evaluate refined expert finding and profiling methods that incorporate <br>topicality and organizational structure</br>."
Spanish: "Un segundo grupo de experimentos está dirigido a extensiones de los métodos base que aprovechan las características distintivas de la Colección de Expertos de UvT; específicamente, proponemos y evaluamos métodos refinados de búsqueda y perfilado de expertos que incorporan la topicalidad y la estructura organizativa."
output: topicalidad y la estructura organizativa
"""
# Bare string literal that is not assigned to any name, so it has no effect
# at runtime. NOTE(review): looks like one more few-shot example that was
# taken out of prompt3 -- confirm before deleting.
"""
"Where have all the <br>PC makers</br> gone?".
Spanish sentence: "¿Dónde se han ido todos los fabricantes de PC?".
Output: fabricantes de PC
"""
def gpt_translator_key2(text1,text2):
    """Ask the model which Spanish words correspond to the marked English term.

    The two sentences are sent in the ``English: "..."`` / ``Spanish: "..."``
    / ``Output: `` layout, with ``prompt3`` as the system prompt.

    Args:
        text1: English sentence containing the marked term.
        text2: Spanish translation of the same sentence.

    Returns:
        The message content of the first choice, stripped of surrounding
        whitespace.
    """
    user_content = (
        'English: "' + text1 + '"\nSpanish: "' + text2 + '"\nOutput: '
    )
    conversation = [
        {"role": "system", "content": prompt3},
        {"role": "user", "content": user_content},
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=conversation,
    )
    return response.choices[0].message.content.strip()
#res= translate_text_original(["Hello, world this is a test to follow instructions"])
#print(res)
import re
def replace_with_quotes_hard_gpt(text, term):
    """Wrap every occurrence of ``term`` in ``text`` with ``<br>`` / ``</br>``.

    The term is matched literally (regex metacharacters are escaped) and
    without regard to case. Each match is replaced by the term exactly as
    passed in, so the casing of ``term`` wins over the casing found in
    ``text``.

    Args:
        text: Sentence in which the term is searched.
        term: Term to annotate. An empty term leaves ``text`` unchanged.

    Returns:
        ``text`` with all occurrences of the term annotated.
    """
    if not term:
        # An empty pattern would match at every position of the text.
        return text

    pattern = re.compile(re.escape(term), re.IGNORECASE)
    annotated_term = "<br>" + term + "</br>"

    # A callable replacement inserts the term verbatim; a replacement string
    # would be parsed as a template and misread backslashes in the term.
    # No count is passed, so every occurrence is replaced.
    return pattern.sub(lambda _match: annotated_term, text)
""" TEST PROMTS
text1= "A second group of experiments is aimed at extensions of the baseline methods that exploit characteristic features of the UvT Expert Collection; specifically, we propose and evaluate refined expert finding and profiling methods that incorporate <br>topicality and organizational structure</br>."
text2="Un segundo grupo de experimentos está dirigido a extensiones de los métodos base que aprovechan las características distintivas de la Colección de Expertos de UvT; específicamente, proponemos y evaluamos métodos refinados de búsqueda y perfilado de expertos que incorporan la topicalidad y la estructura organizativa."
text1 ="<br>broad expertise retrieval</br> in Sparse Data Environments Krisztian Balog ISLA, University of Amsterdam Kruislaan 403, 1098 SJ Amsterdam, The Netherlands kbalog@science.uva.nl Toine Bogers ILK, Tilburg University P.O."
text2 = "Recuperación de amplia experiencia en entornos de datos dispersos Krisztian Balog ISLA, Universidad de Ámsterdam Kruislaan 403, 1098 SJ Ámsterdam, Países Bajos kbalog@science.uva.nl Toine Bogers ILK, Universidad de Tilburg P.O."
res= gpt_translator_key2(text1,text2)
print(res)
translated_sentence = replace_with_quotes_hard_gpt(text2, res.strip())
val = is_sentence_to_translate(translated_sentence)
print(val)
print(translated_sentence)
"""