基于Qwen-VL的手机智能体开发

打印 上一主题 下一主题

主题 994|帖子 994|积分 2982

先上Demo:
     vl_agent_demo
  
代码如下:
0 设置工作目录:
    你的工作目录需要如下:
   其中utils文件夹和qwenvl_agent.py均参考自
  GitHub - QwenLM/Qwen2.5-VL: Qwen2.5-VL is the multimodal large language model series developed by Qwen team, Alibaba Cloud.Qwen2.5-VL is the multimodal large language model series developed by Qwen team, Alibaba Cloud. - QwenLM/Qwen2.5-VL
https://github.com/QwenLM/Qwen2.5-VL
   YourProj(文件夹):
                  utils(文件夹)
                        agent_function_call.py
                  mobile_agent.py
                  qwenvl_agent.py
  (1)运行代码mobile_agent.py:
  1. import os
  2. import time
  3. import json
  4. from ppadb.client import Client as AdbClient
  5. import uiautomator2 as u2
  6. import base64
  7. from qwenvl_agent import perform_gui_grounding_with_api
  8. class Android_VL_Agent:
  9.     def __init__(self):
  10.         self.client = AdbClient(host="127.0.0.1", port=5037)
  11.         self.device_serial = None
  12.         self.u2_device = None
  13.         self.SCREENSHOT_PATH = None
  14.         self.QWEN_MODEL_ID = 'qwen2.5-vl-7b-instruct'
  15.         self.__set_up()
  16.     @staticmethod
  17.     def check_adb_service():
  18.         try:
  19.             result = os.popen("adb devices").read()
  20.             if "List of devices attached" in result:
  21.                 return True
  22.             else:
  23.                 os.system("adb start-server")
  24.                 time.sleep(5)  # 等待 ADB 服务启动
  25.                 result = os.popen("adb devices").read()
  26.                 if "List of devices attached" in result:
  27.                     return True
  28.                 else:
  29.                     return False
  30.         except Exception:
  31.             print("ADB服务启动失败")
  32.             return False
  33.     @staticmethod
  34.     def encode_image(image_path):
  35.         with open(image_path, "rb") as image_file:
  36.             return base64.b64encode(image_file.read()).decode("utf-8")
  37.     @staticmethod
  38.     def info_parser(info):
  39.         try:
  40.             body = info.split("<tool_call>")[1].split("</tool_call>")[0]
  41.             return json.loads(body)
  42.         except Exception as e:
  43.             print(f"解析失败: {str(e)}")
  44.             return None
  45.     # 启动
  46.     def __set_up(self):
  47.         assert self.check_adb_service()
  48.         devices = self.client.devices()
  49.         self.device_serial = devices[0].serial if devices else None
  50.         self.u2_device = u2.connect(self.device_serial)
  51.         self.SCREENSHOT_PATH = "screenshot.png"
  52.     # 定义单点事件
  53.     def __single_point_event(self,x,y):
  54.         try:
  55.             self.u2_device.click(x, y)
  56.             return True
  57.         except Exception as e:
  58.             print(f"单点失败: {str(e)}")
  59.             return False
  60.     # 定义输入内容
  61.     def __input_content(self,content):
  62.         try:
  63.             self.u2_device.send_keys(content)
  64.             return True
  65.         except Exception as e:
  66.             print(f"输入失败: {str(e)}")
  67.             return False
  68.     # 截图并保存
  69.     def __screenshot(self):
  70.         try:
  71.             # 清除之前的截图
  72.             if os.path.exists(self.SCREENSHOT_PATH):
  73.                 os.remove(self.SCREENSHOT_PATH)
  74.             screenshot = self.u2_device.screenshot()
  75.             screenshot.save(self.SCREENSHOT_PATH)
  76.             # screenshot.show()
  77.             return True
  78.         except Exception as e:
  79.             print(f"截图失败: {str(e)}")
  80.             return False
  81.     def __Qwen_vl_agent(self, query):
  82.         output_info = perform_gui_grounding_with_api(self.SCREENSHOT_PATH, query, self.QWEN_MODEL_ID)
  83.         # print(output_info)
  84.         result = self.info_parser(str(output_info))["arguments"]
  85.         return result
  86.     def __action(self,result):
  87.         if "click" in result["action"]:
  88.             coordinate = result["coordinate"]
  89.             self.__single_point_event(coordinate[0],coordinate[1])
  90.         elif "type" in result["action"]:
  91.             self.__input_content(result["text"])
  92.     def run(self,query):
  93.         # 重新连接
  94.         self.u2_device = u2.connect(self.device_serial)
  95.         # 感知
  96.         self.__screenshot()
  97.         # 理解
  98.         result = self.__Qwen_vl_agent(query)
  99.         print(result)
  100.         # 执行
  101.         self.__action(result)
  102.     def __call__(self,query):
  103.         self.run(query)
  104. if __name__ == "__main__":
  105.     agent = Android_VL_Agent()
  106.     # timestep
  107.     timestep = 2
  108.     name = "名字"
  109.     message = "信息"
  110.     agent.run("打开微信")
  111.     time.sleep(timestep)
  112.     agent.run(f"点击和{name}聊天框的的顶部区域进入聊天界面")
  113.     time.sleep(timestep)
  114.     agent.run("点击屏幕底部的输入框部分进入输入界面")
  115.     time.sleep(timestep)
  116.     agent.run(f"在聊天框输入内容:{message}")
  117.     time.sleep(timestep)
  118.     agent.run("点击右侧发送按钮中心位置发送消息")
复制代码
(2)方法代码qwenvl_agent.py
  1. import json
  2. import base64
  3. from openai import OpenAI
  4. from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
  5.     NousFnCallPrompt,
  6.     Message,
  7.     ContentItem,
  8. )
  9. from PIL import Image, ImageDraw, ImageColor
  10. from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
  11. import warnings
  12. warnings.filterwarnings("ignore")
  13. from utils.agent_function_call import ComputerUse
  14. def draw_point(image: Image.Image, point: list, color=None):
  15.     if isinstance(color, str):
  16.         try:
  17.             color = ImageColor.getrgb(color)
  18.             color = color + (128,)
  19.         except ValueError:
  20.             color = (255, 0, 0, 128)
  21.     else:
  22.         color = (255, 0, 0, 128)
  23.     overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
  24.     overlay_draw = ImageDraw.Draw(overlay)
  25.     radius = min(image.size) * 0.05
  26.     x, y = point
  27.     overlay_draw.ellipse(
  28.         [(x - radius, y - radius), (x + radius, y + radius)],
  29.         fill=color)
  30.     center_radius = radius * 0.1
  31.     overlay_draw.ellipse(
  32.         [(x - center_radius, y - center_radius),
  33.          (x + center_radius, y + center_radius)],
  34.         fill=(0, 255, 0, 255))
  35.     image = image.convert('RGBA')
  36.     combined = Image.alpha_composite(image, overlay)
  37.     return combined.convert('RGB')
  38. def encode_image(image_path):
  39.     with open(image_path, "rb") as image_file:
  40.         return base64.b64encode(image_file.read()).decode("utf-8")
  41. def perform_gui_grounding_with_api(screenshot_path, user_query, model_id, min_pixels=3136, max_pixels=12845056):
  42.     """
  43.     Perform GUI grounding using Qwen model to interpret user query on a screenshot.
  44.     Args:
  45.         screenshot_path (str): Path to the screenshot image
  46.         user_query (str): User's query/instruction
  47.         model: Preloaded Qwen model
  48.         min_pixels: Minimum pixels for the image
  49.         max_pixels: Maximum pixels for the image
  50.     Returns:
  51.         tuple: (output_text, display_image) - Model's output text and annotated image
  52.     """
  53.     # Open and process image
  54.     input_image = Image.open(screenshot_path)
  55.     base64_image = encode_image(screenshot_path)
  56.     client = OpenAI(
  57.         # If the environment variable is not configured, please replace the following line with the Dashscope API Key: api_key="sk-xxx". Access via https://bailian.console.alibabacloud.com/?apiKey=1 "
  58.         api_key="xxx",
  59.         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
  60.     )
  61.     resized_height, resized_width = smart_resize(
  62.         input_image.height,
  63.         input_image.width,
  64.         min_pixels=min_pixels,
  65.         max_pixels=max_pixels,
  66.     )
  67.     # Initialize computer use function
  68.     computer_use = ComputerUse(
  69.         cfg={"display_width_px": resized_width, "display_height_px": resized_height}
  70.     )
  71.     # Build messages
  72.     system_message = NousFnCallPrompt.preprocess_fncall_messages(
  73.         messages=[
  74.             Message(role="system", content=[ContentItem(text="You are a helpful assistant.")]),
  75.         ],
  76.         functions=[computer_use.function],
  77.         lang=None,
  78.     )
  79.     system_message = system_message[0].model_dump()
  80.     messages = [
  81.         {
  82.             "role": "system",
  83.             "content": [
  84.                 {"type": "text", "text": msg["text"]} for msg in system_message["content"]
  85.             ],
  86.         },
  87.         {
  88.             "role": "user",
  89.             "content": [
  90.                 {
  91.                     "type": "image_url",
  92.                     "min_pixels": min_pixels,
  93.                     "max_pixels": max_pixels,
  94.                     # Pass in BASE64 image data. Note that the image format (i.e., image/{format}) must match the Content Type in the list of supported images. "f" is the method for string formatting.
  95.                     # PNG image:  f"data:image/png;base64,{base64_image}"
  96.                     # JPEG image: f"data:image/jpeg;base64,{base64_image}"
  97.                     # WEBP image: f"data:image/webp;base64,{base64_image}"
  98.                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
  99.                 },
  100.                 {"type": "text", "text": user_query},
  101.             ],
  102.         }
  103.     ]
  104.     # print(json.dumps(messages, indent=4))
  105.     completion = client.chat.completions.create(
  106.         model=model_id,
  107.         messages=messages,
  108.     )
  109.     output_text = completion.choices[0].message.content
  110.     # Parse action and visualize
  111.     # print(output_text)
  112.     action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
  113.     # display_image = input_image.resize((resized_width, resized_height))
  114.     # display_image = draw_point(input_image, action['arguments']['coordinate'], color='green')
  115.     return output_text
  116. if __name__ == "__main__":
  117.     screenshot = "screenshot.png"
  118.     user_query = '在聊天框输入内容:下午好!'
  119.     model_id = "qwen2.5-vl-7b-instruct"
  120.     output_text = perform_gui_grounding_with_api(screenshot, user_query, model_id)
  121.     print(output_text)
复制代码


免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。

本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有账号?立即注册

x
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

农妇山泉一亩田

金牌会员
这个人很懒什么都没写!
快速回复 返回顶部 返回列表