OpenClaw Lite Implementation

OpenClaw is a powerful scraping tool. Below is a lite implementation of its core:

Key Features

  • Simple, easy-to-use API
  • Multi-threaded crawling
  • Automatic retry mechanism
  • Flexible parsing

Core Implementation

```python
import requests
import re
import time
import threading
from queue import Queue, Empty
from typing import Optional, List, Dict, Any, Callable
from urllib.parse import urljoin, urlparse
import logging


class OpenClawLite:
    """OpenClaw Lite."""
    def __init__(
        self,
        max_workers: int = 5,
        delay: float = 0.5,
        timeout: int = 10,
        max_retries: int = 3,
        user_agent: Optional[str] = None
    ):
        """
        初始化 OpenClaw 精简版
        参数:
            max_workers: 最大工作线程数
            delay: 请求延迟(秒)
            timeout: 请求超时时间(秒)
            max_retries: 最大重试次数
            user_agent: 自定义 User-Agent
        """
        self.max_workers = max_workers
        self.delay = delay
        self.timeout = timeout
        self.max_retries = max_retries
        self.session = requests.Session()
        if user_agent:
            self.session.headers.update({'User-Agent': user_agent})
        else:
            self.session.headers.update({
                'User-Agent': 'OpenClawLite/1.0'
            })
        self.queue = Queue()
        self.results = []
        self.visited = set()
        self.lock = threading.Lock()
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def fetch(
        self,
        url: str,
        method: str = 'GET',
        data: Optional[Dict] = None,
        headers: Optional[Dict] = None,
        **kwargs
    ) -> Optional[requests.Response]:
        """
        发送 HTTP 请求
        参数:
            url: 目标URL
            method: HTTP方法
            data: POST数据
            headers: 请求头
            **kwargs: 其他 requests 参数
        返回:
            Response 对象或 None
        """
        for attempt in range(self.max_retries):
            try:
                if method.upper() == 'POST':
                    response = self.session.post(
                        url,
                        data=data,
                        headers=headers,
                        timeout=self.timeout,
                        **kwargs
                    )
                else:
                    response = self.session.get(
                        url,
                        headers=headers,
                        timeout=self.timeout,
                        **kwargs
                    )
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                self.logger.warning(f"Request failed (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)  # exponential backoff
                continue
        self.logger.error(f"All retries failed: {url}")
        return None

    def parse_links(self, html: str, base_url: str) -> List[str]:
        """
        Parse links from HTML (naive implementation).
        Args:
            html: HTML content
            base_url: base URL for resolving relative links
        Returns:
            list of absolute URLs
        """
        links = []
        # Naive regex match; for real use, prefer BeautifulSoup
        pattern = r'href=[\'"]?([^\'" >]+)'
        matches = re.findall(pattern, html)
        for match in matches:
            # Convert to an absolute URL
            absolute_url = urljoin(base_url, match)
            # Validate the URL format
            parsed = urlparse(absolute_url)
            if parsed.scheme and parsed.netloc:
                links.append(absolute_url)
        return links

    def worker(self):
        """Worker thread loop."""
        while True:
            try:
                url, callback, depth = self.queue.get(timeout=30)
                # Skip URLs that have already been visited
                with self.lock:
                    if url in self.visited:
                        self.queue.task_done()
                        continue
                    self.visited.add(url)
                self.logger.info(f"Fetching: {url} (depth: {depth})")
                # Fetch the page
                response = self.fetch(url)
                if response:
                    # Invoke the callback to process the result
                    result = callback(response, url, depth) if callback else response.text
                    with self.lock:
                        self.results.append({
                            'url': url,
                            'content': result,
                            'depth': depth
                        })
                    # If depth remains, parse links and enqueue them
                    if depth > 0:
                        links = self.parse_links(response.text, url)
                        for link in links:
                            with self.lock:
                                if link not in self.visited:
                                    self.queue.put((link, callback, depth - 1))
                # Delay to avoid hammering the server
                time.sleep(self.delay)
                self.queue.task_done()
            except Empty:
                break
            except Exception as e:
                self.logger.error(f"Worker thread error: {e}")
                self.queue.task_done()

    def crawl(
        self,
        start_urls: List[str],
        max_depth: int = 1,
        callback: Optional[Callable] = None
    ) -> List[Dict[str, Any]]:
        """
        Start crawling.
        Args:
            start_urls: list of seed URLs
            max_depth: maximum crawl depth
            callback: custom callback function
        Returns:
            list of crawl results
        """
        # Reset state
        self.results = []
        self.visited = set()
        # Enqueue the seed URLs
        for url in start_urls:
            self.queue.put((url, callback, max_depth))
        # Spawn worker threads
        threads = []
        for _ in range(min(self.max_workers, len(start_urls))):
            thread = threading.Thread(target=self.worker)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # Wait for the queue to drain
        self.queue.join()
        # Wait for all worker threads to finish
        for thread in threads:
            thread.join(timeout=5)
        return self.results

    def save_results(self, filename: str, format: str = 'json'):
        """
        Save results to a file.
        Args:
            filename: output file name
            format: output format ('json' or 'txt')
        """
        import json
        with open(filename, 'w', encoding='utf-8') as f:
            if format.lower() == 'json':
                json.dump(self.results, f, ensure_ascii=False, indent=2)
            else:
                for result in self.results:
                    f.write(f"URL: {result['url']}\n")
                    f.write(f"Depth: {result['depth']}\n")
                    f.write(f"Content:\n{str(result['content'])[:500]}...\n")
                    f.write("-" * 50 + "\n")
        self.logger.info(f"Results saved to: {filename}")

    def close(self):
        """Close the HTTP session."""
        self.session.close()


# Usage example
def example_callback(response, url, depth):
    """Example callback: extract the title and a text preview."""
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the title
    title = soup.title.string if soup.title and soup.title.string else 'No title'
    # Extract body text (naive example)
    text = soup.get_text()[:200]  # first 200 characters
    return {
        'url': url,
        'title': title.strip(),
        'preview': text.strip(),
        'status': response.status_code,
        'depth': depth
    }
if __name__ == "__main__":
    # 示例用法
    claw = OpenClawLite(
        max_workers=3,
        delay=1.0,
        max_retries=2
    )
    try:
        # 开始抓取
        results = claw.crawl(
            start_urls=['https://httpbin.org/html'],
            max_depth=1,
            callback=example_callback
        )
        # 打印结果
        for result in results:
            print(f"URL: {result['content']['url']}")
            print(f"标题: {result['content']['title']}")
            print(f"预览: {result['content']['preview']}")
            print()
        # 保存结果
        claw.save_results('results.json', 'json')
    finally:
        claw.close()
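
As the comment in `parse_links` notes, a real deployment should parse links with BeautifulSoup rather than a regex. A minimal drop-in sketch (a hypothetical `parse_links_bs4` helper, assuming `beautifulsoup4` is installed) might look like this:

```python
from typing import List
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup


def parse_links_bs4(html: str, base_url: str) -> List[str]:
    """Hypothetical BeautifulSoup-based replacement for OpenClawLite.parse_links."""
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    # Only consider <a> tags that actually carry an href attribute
    for a in soup.find_all('a', href=True):
        absolute_url = urljoin(base_url, a['href'])
        parsed = urlparse(absolute_url)
        if parsed.scheme in ('http', 'https') and parsed.netloc:
            links.append(absolute_url)
    return links
```

You could wire this in by subclassing `OpenClawLite` and overriding `parse_links`; unlike the regex, it ignores `href`-like text inside scripts and comments.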

Simplified Version with Fewer Dependencies

If you need an even lighter version (without BeautifulSoup), you can build a pure-requests variant:

```python
# openclaw_mini.py
import requests
import json
import time


class OpenClawMini:
    """OpenClaw ultra-lite."""
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'OpenClawMini/1.0'
        })

    def get(self, url, **kwargs):
        """Simple GET request."""
        try:
            response = self.session.get(url, timeout=10, **kwargs)
            return response.text
        except Exception as e:
            print(f"Request failed: {e}")
            return None

    def post(self, url, data=None, **kwargs):
        """Simple POST request."""
        try:
            response = self.session.post(url, data=data, timeout=10, **kwargs)
            return response.text
        except Exception as e:
            print(f"Request failed: {e}")
            return None

    def scrape(self, urls):
        """Batch scrape."""
        results = []
        for url in urls:
            print(f"Fetching: {url}")
            html = self.get(url)
            if html:
                results.append({
                    'url': url,
                    'content': html[:1000]  # keep only the first 1000 characters
                })
            time.sleep(1)  # polite delay
        return results


# Usage example
if __name__ == "__main__":
    claw = OpenClawMini()
    # Fetch a single page
    html = claw.get("https://httpbin.org/html")
    if html:
        print(f"Fetched {len(html)} characters")
    # Batch scrape
    results = claw.scrape([
        "https://httpbin.org/html",
        "https://httpbin.org/get"
    ])
    # Save the results
    with open('scraped.json', 'w') as f:
        json.dump(results, f, indent=2)
```
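
OpenClawMini drops the automatic retry mechanism that the full version provides. If you want it back without adding any dependencies, a minimal sketch (a hypothetical `get_with_retry` helper mirroring the exponential backoff used in `OpenClawLite.fetch`) could look like this:

```python
import time


def get_with_retry(claw, url, max_retries=3, **kwargs):
    """Hypothetical helper: retry claw.get() with exponential backoff."""
    for attempt in range(max_retries):
        html = claw.get(url, **kwargs)
        if html is not None:
            return html  # success
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
    return None  # all attempts failed
```

Calling `get_with_retry(claw, 'https://httpbin.org/html')` then behaves like `claw.get`, but tolerates transient failures.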

Quick Start

  1. Install the dependencies (full version):

     ```
     pip install requests beautifulsoup4
     ```

  2. Use the full version:

     ```python
     from openclaw_lite import OpenClawLite

     claw = OpenClawLite(max_workers=3)
     results = claw.crawl(['https://example.com'], max_depth=1)
     ```

  3. Use the ultra-lite version (no extra dependencies):

     ```python
     from openclaw_mini import OpenClawMini

     claw = OpenClawMini()
     html = claw.get('https://example.com')
     ```
This lite version keeps OpenClaw's core functionality and is well suited to learning and rapid prototyping. For production use, consider a mature framework such as Scrapy, or add more robust error handling and features.
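
For a sense of what the Scrapy route looks like, here is a minimal sketch of a spider covering the same crawl-and-follow pattern as `OpenClawLite` (a hypothetical `ExampleSpider`; run it with `scrapy runspider spider.py -o results.json`):

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://httpbin.org/html"]
    # Roughly equivalent to max_depth=1 and delay=1.0 above
    custom_settings = {"DEPTH_LIMIT": 1, "DOWNLOAD_DELAY": 1.0}

    def parse(self, response):
        # Yield the page URL and title as a scraped item
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
        }
        # Follow in-page links; DEPTH_LIMIT caps the recursion
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)
```

Scrapy then handles retries, politeness delays, deduplication, and concurrency for you.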
