Demo first:
vl_agent_demo
The code is as follows:
(0) Set up the working directory:
Your working directory should be laid out as follows:
The utils folder and qwenvl_agent.py are both adapted from:
GitHub - QwenLM/Qwen2.5-VL: Qwen2.5-VL is the multimodal large language model series developed by the Qwen team, Alibaba Cloud. https://github.com/QwenLM/Qwen2.5-VL
YourProj (folder)
    utils (folder)
        agent_function_call.py
    mobile_agent.py
    qwenvl_agent.py
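A note on dependencies (inferred from the imports below, not stated in the original post): the scripts use pure-python-adb (imported as ppadb), uiautomator2, openai, qwen-agent, pillow, and a recent transformers release that ships smart_resize, so roughly `pip install pure-python-adb uiautomator2 openai qwen-agent pillow transformers`. You also need adb on your PATH and an Android device or emulator with USB debugging enabled.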
(1) Main script, mobile_agent.py:
import os
import time
import json
import base64

from ppadb.client import Client as AdbClient
import uiautomator2 as u2

from qwenvl_agent import perform_gui_grounding_with_api


class Android_VL_Agent:
    def __init__(self):
        self.client = AdbClient(host="127.0.0.1", port=5037)
        self.device_serial = None
        self.u2_device = None
        self.SCREENSHOT_PATH = None
        self.QWEN_MODEL_ID = 'qwen2.5-vl-7b-instruct'
        self.__set_up()

    @staticmethod
    def check_adb_service():
        try:
            result = os.popen("adb devices").read()
            if "List of devices attached" in result:
                return True
            else:
                os.system("adb start-server")
                time.sleep(5)  # wait for the ADB server to come up
                result = os.popen("adb devices").read()
                if "List of devices attached" in result:
                    return True
                else:
                    return False
        except Exception:
            print("Failed to start the ADB server")
            return False

    @staticmethod
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def info_parser(info):
        # Extract and decode the JSON body between <tool_call> ... </tool_call>
        try:
            body = info.split("<tool_call>")[1].split("</tool_call>")[0]
            return json.loads(body)
        except Exception as e:
            print(f"Parsing failed: {str(e)}")
            return None

    # Start-up: connect to the first ADB device via uiautomator2
    def __set_up(self):
        assert self.check_adb_service()
        devices = self.client.devices()
        self.device_serial = devices[0].serial if devices else None
        self.u2_device = u2.connect(self.device_serial)
        self.SCREENSHOT_PATH = "screenshot.png"

    # Single-tap event
    def __single_point_event(self, x, y):
        try:
            self.u2_device.click(x, y)
            return True
        except Exception as e:
            print(f"Tap failed: {str(e)}")
            return False

    # Type text into the focused input field
    def __input_content(self, content):
        try:
            self.u2_device.send_keys(content)
            return True
        except Exception as e:
            print(f"Input failed: {str(e)}")
            return False

    # Take a screenshot and save it
    def __screenshot(self):
        try:
            # Remove the previous screenshot
            if os.path.exists(self.SCREENSHOT_PATH):
                os.remove(self.SCREENSHOT_PATH)
            screenshot = self.u2_device.screenshot()
            screenshot.save(self.SCREENSHOT_PATH)
            # screenshot.show()
            return True
        except Exception as e:
            print(f"Screenshot failed: {str(e)}")
            return False

    def __Qwen_vl_agent(self, query):
        output_info = perform_gui_grounding_with_api(self.SCREENSHOT_PATH, query, self.QWEN_MODEL_ID)
        # print(output_info)
        result = self.info_parser(str(output_info))["arguments"]
        return result

    def __action(self, result):
        if "click" in result["action"]:
            coordinate = result["coordinate"]
            self.__single_point_event(coordinate[0], coordinate[1])
        elif "type" in result["action"]:
            self.__input_content(result["text"])

    def run(self, query):
        # Reconnect
        self.u2_device = u2.connect(self.device_serial)
        # Perceive
        self.__screenshot()
        # Understand
        result = self.__Qwen_vl_agent(query)
        print(result)
        # Act
        self.__action(result)

    def __call__(self, query):
        self.run(query)


if __name__ == "__main__":
    agent = Android_VL_Agent()
    # Delay between steps, in seconds
    timestep = 2
    name = "名字"        # contact name (placeholder)
    message = "信息"     # message text (placeholder)
    agent.run("打开微信")  # "Open WeChat"
    time.sleep(timestep)
    agent.run(f"点击和{name}聊天框的顶部区域进入聊天界面")  # "Tap the chat entry for {name} to open the conversation"
    time.sleep(timestep)
    agent.run("点击屏幕底部的输入框部分进入输入界面")  # "Tap the input box at the bottom of the screen"
    time.sleep(timestep)
    agent.run(f"在聊天框输入内容:{message}")  # "Type into the chat box: {message}"
    time.sleep(timestep)
    agent.run("点击右侧发送按钮中心位置发送消息")  # "Tap the center of the Send button on the right"
(2) Helper module, qwenvl_agent.py:
import json
import base64
import warnings

from openai import OpenAI
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    NousFnCallPrompt,
    Message,
    ContentItem,
)
from PIL import Image, ImageDraw, ImageColor
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

warnings.filterwarnings("ignore")

from utils.agent_function_call import ComputerUse


def draw_point(image: Image.Image, point: list, color=None):
    # Draw a semi-transparent marker at `point` for visualizing the predicted click
    if isinstance(color, str):
        try:
            color = ImageColor.getrgb(color)
            color = color + (128,)
        except ValueError:
            color = (255, 0, 0, 128)
    else:
        color = (255, 0, 0, 128)

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    radius = min(image.size) * 0.05
    x, y = point
    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        fill=color)
    center_radius = radius * 0.1
    overlay_draw.ellipse(
        [(x - center_radius, y - center_radius),
         (x + center_radius, y + center_radius)],
        fill=(0, 255, 0, 255))
    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    return combined.convert('RGB')


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def perform_gui_grounding_with_api(screenshot_path, user_query, model_id, min_pixels=3136, max_pixels=12845056):
    """
    Perform GUI grounding using a Qwen model to interpret a user query against a screenshot.

    Args:
        screenshot_path (str): Path to the screenshot image
        user_query (str): User's query/instruction
        model_id (str): DashScope model ID, e.g. "qwen2.5-vl-7b-instruct"
        min_pixels: Minimum pixel budget for the image
        max_pixels: Maximum pixel budget for the image

    Returns:
        str: The model's raw output text, including the <tool_call> block
    """
    # Open and process image
    input_image = Image.open(screenshot_path)
    base64_image = encode_image(screenshot_path)

    client = OpenAI(
        # If the environment variable is not configured, replace the following line
        # with your DashScope API key: api_key="sk-xxx".
        # Keys are issued at https://bailian.console.alibabacloud.com/?apiKey=1
        api_key="xxx",
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    resized_height, resized_width = smart_resize(
        input_image.height,
        input_image.width,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    # Initialize the computer-use tool schema with the resized display size
    computer_use = ComputerUse(
        cfg={"display_width_px": resized_width, "display_height_px": resized_height}
    )

    # Build messages
    system_message = NousFnCallPrompt.preprocess_fncall_messages(
        messages=[
            Message(role="system", content=[ContentItem(text="You are a helpful assistant.")]),
        ],
        functions=[computer_use.function],
        lang=None,
    )
    system_message = system_message[0].model_dump()
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": msg["text"]} for msg in system_message["content"]
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "min_pixels": min_pixels,
                    "max_pixels": max_pixels,
                    # Pass base64 image data. The MIME type (image/{format}) must match
                    # one of the supported content types:
                    #   PNG:  f"data:image/png;base64,{base64_image}"
                    #   JPEG: f"data:image/jpeg;base64,{base64_image}"
                    #   WEBP: f"data:image/webp;base64,{base64_image}"
                    # The agent saves screenshot.png, so declare PNG here.
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": user_query},
            ],
        }
    ]
    # print(json.dumps(messages, indent=4))

    completion = client.chat.completions.create(
        model=model_id,
        messages=messages,
    )
    output_text = completion.choices[0].message.content

    # Parse the action (kept for the optional visualization below)
    # print(output_text)
    action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
    # display_image = input_image.resize((resized_width, resized_height))
    # display_image = draw_point(input_image, action['arguments']['coordinate'], color='green')
    return output_text


if __name__ == "__main__":
    screenshot = "screenshot.png"
    user_query = '在聊天框输入内容:下午好!'  # "Type into the chat box: Good afternoon!"
    model_id = "qwen2.5-vl-7b-instruct"
    output_text = perform_gui_grounding_with_api(screenshot, user_query, model_id)
    print(output_text)
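You can smoke-test qwenvl_agent.py on its own via its __main__ block before wiring it into the agent.

One caveat worth noting: the model grounds its coordinates in the smart_resize-d image space (resized_width x resized_height), while mobile_agent.py taps in raw device pixels. With the default max_pixels a phone screenshot stays close to its native size (smart_resize only rounds each side to a multiple of 28), so taps generally land, but strictly the point should be mapped back. A minimal sketch of that mapping; the helper name and the example numbers are mine, not part of the original code:

def rescale_point(coordinate, resized_wh, device_wh):
    # Map a point from the model's resized image space back to device pixels
    (x, y), (rw, rh), (dw, dh) = coordinate, resized_wh, device_wh
    return int(round(x * dw / rw)), int(round(y * dh / rh))

# Example: a 1080x2400 screen resizes to 1092x2408 (multiples of 28),
# so a predicted point (546, 1204) maps back to device pixels (540, 1200):
# rescale_point((546, 1204), (1092, 2408), (1080, 2400)) -> (540, 1200)

To wire this in, perform_gui_grounding_with_api would also need to return (resized_width, resized_height), and mobile_agent.py would read the device resolution, e.g. from self.u2_device.window_size().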