-
Notifications
You must be signed in to change notification settings - Fork 0
/
openai_vision.py
179 lines (142 loc) · 5.36 KB
/
openai_vision.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import time
import requests
import base64
from PIL import ImageGrab
import os
import time
from typing import Tuple, List
import os
import base64
import requests
from typing import Tuple
import aiohttp
# OpenAI API key used in the Authorization header of the requests below.
# Read once at import time; os.getenv returns None when OPENAI_API_KEY is unset.
api_key = os.getenv('OPENAI_API_KEY')
def take_screenshot() -> str:
    """
    Grab the screen with ``ImageGrab.grab()`` and save it as a PNG under ``images/``.

    The file is named ``screenshot_<unix-seconds>.png``, so two captures taken within
    the same second are written to the same path.

    Returns:
        str: Relative path of the saved screenshot.
    """
    # exist_ok=True replaces the separate existence check, which could race with
    # another process creating the directory between the check and the mkdir.
    os.makedirs('images', exist_ok=True)

    # Timestamp-based name keeps successive captures apart (1-second resolution).
    filename = f"images/screenshot_{int(time.time())}.png"

    screenshot = ImageGrab.grab()
    screenshot.save(filename)
    print(f"Screenshot saved as {filename}")

    # Hand the path back so callers do not have to rescan the directory for it.
    return filename
def see_computer_screen() -> Tuple[bool, str]:
    """
    Take a screenshot, send it to the OpenAI chat-completions vision model, then delete it.

    Returns:
        Tuple[bool, str]: ``(True, payload)`` when the HTTP request completed, where
        ``payload`` is the parsed JSON body of the response (the full response object,
        not only the description text); ``(False, message)`` when no screenshot could
        be found or any step raised an exception.
    """
    # Bound before the try block so the cleanup below never touches an unassigned
    # name when take_screenshot() itself is the call that fails.
    image_path = ""
    try:
        take_screenshot()

        # Pick up the file that was just written to 'images/'.
        image_path = find_most_recent_image('images')
        if not image_path:
            return False, "No images found in the 'images/' directory."

        base64_image = encode_image(image_path)

        # Label the data URL with the real format: screenshots are saved as PNG,
        # while find_most_recent_image may also return .jpg/.jpeg files.
        mime_type = "image/png" if image_path.lower().endswith(".png") else "image/jpeg"

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "What’s in this image?",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 300,
        }

        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,
        )
        return True, response.json()
    except Exception as e:
        return False, f"An error occurred: {e}"
    finally:
        # Single cleanup point for success and failure alike, so the file is never
        # removed twice and a missing file cannot mask the real result.
        if image_path:
            try:
                os.remove(image_path)
            except FileNotFoundError:
                pass
def find_most_recent_image(directory: str) -> str:
    """
    Locate the newest image in a directory, judged by file modification time.

    Args:
        directory (str): Directory whose entries are scanned (not recursive).

    Returns:
        str: ``directory`` joined with the name of the most recently modified
        ``.png``/``.jpg``/``.jpeg`` entry, or an empty string when there is no such
        entry or the lookup raised.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    try:
        # Collect every entry whose name ends with one of the image suffixes.
        candidates = []
        for entry in os.listdir(directory):
            if entry.endswith(image_suffixes):
                candidates.append(os.path.join(directory, entry))

        if not candidates:
            return ""

        # The newest file is the one with the largest modification timestamp.
        return max(candidates, key=os.path.getmtime)
    except Exception as e:
        print(f"Error finding the most recent image: {e}")
        return ""
def encode_image(image_path: str) -> str:
    """
    Read a file and return its contents as base64 text.

    Args:
        image_path (str): Path of the file to read in binary mode.

    Returns:
        str: Base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
async def see_computer_screen_async() -> Tuple[bool, str]:
    """
    Async variant: take a screenshot, ask the OpenAI vision model to describe it, then delete it.

    Only the HTTP request is awaited; take_screenshot(), find_most_recent_image() and
    encode_image() are ordinary synchronous calls made directly from this coroutine.

    Returns:
        Tuple[bool, str]: ``(True, description)`` with the text found at
        ``choices[0].message.content`` of the response, or ``(False, message)`` when no
        screenshot could be found or any step raised an exception (including a response
        that lacks the expected ``choices`` structure).
    """
    # Bound before the try block so the cleanup below never touches an unassigned
    # name when take_screenshot() itself is the call that fails.
    image_path = ""
    try:
        take_screenshot()

        image_path = find_most_recent_image('images')
        if not image_path:
            return False, "No images found in the 'images/' directory."

        base64_image = encode_image(image_path)

        # Label the data URL with the real format: screenshots are saved as PNG,
        # while find_most_recent_image may also return .jpg/.jpeg files.
        mime_type = "image/png" if image_path.lower().endswith(".png") else "image/jpeg"

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What’s in this image?"},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                    ],
                }
            ],
            "max_tokens": 300,
        }

        # Asynchronous HTTP request; the body is fully read before the session closes.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
            ) as response:
                response_data = await response.json()

        # Extract the description text from the response.
        description_text = response_data['choices'][0]['message']['content']
        return True, description_text
    except Exception as e:
        return False, f"An error occurred: {e}"
    finally:
        # Single cleanup point for success and failure alike, so the file is never
        # removed twice and a missing file cannot mask the real result.
        if image_path:
            try:
                os.remove(image_path)
            except FileNotFoundError:
                pass