OpenClaw is a powerful web-scraping tool. Below is a stripped-down implementation of its core:

## Main Features

- Simple, easy-to-use API
- Multithreaded crawling
- Automatic retry mechanism
- Flexible parsing

## Core Implementation
```python
import requests
import time
import threading
from queue import Queue, Empty
from typing import Optional, List, Dict, Any, Callable
from urllib.parse import urljoin, urlparse
import logging


class OpenClawLite:
    """OpenClaw lite edition."""

    def __init__(
        self,
        max_workers: int = 5,
        delay: float = 0.5,
        timeout: int = 10,
        max_retries: int = 3,
        user_agent: Optional[str] = None
    ):
        """
        Initialize OpenClaw lite.

        Parameters:
            max_workers: maximum number of worker threads
            delay: delay between requests (seconds)
            timeout: request timeout (seconds)
            max_retries: maximum number of retries
            user_agent: custom User-Agent
        """
        self.max_workers = max_workers
        self.delay = delay
        self.timeout = timeout
        self.max_retries = max_retries
        self.session = requests.Session()
        if user_agent:
            self.session.headers.update({'User-Agent': user_agent})
        else:
            self.session.headers.update({
                'User-Agent': 'OpenClawLite/1.0'
            })
        self.queue = Queue()
        self.results = []
        self.visited = set()
        self.lock = threading.Lock()
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def fetch(
        self,
        url: str,
        method: str = 'GET',
        data: Optional[Dict] = None,
        headers: Optional[Dict] = None,
        **kwargs
    ) -> Optional[requests.Response]:
        """
        Send an HTTP request with retries.

        Parameters:
            url: target URL
            method: HTTP method
            data: POST data
            headers: request headers
            **kwargs: extra arguments passed through to requests

        Returns:
            Response object, or None if all retries failed
        """
        for attempt in range(self.max_retries):
            try:
                if method.upper() == 'POST':
                    response = self.session.post(
                        url,
                        data=data,
                        headers=headers,
                        timeout=self.timeout,
                        **kwargs
                    )
                else:
                    response = self.session.get(
                        url,
                        headers=headers,
                        timeout=self.timeout,
                        **kwargs
                    )
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                self.logger.warning(
                    f"Request failed (attempt {attempt + 1}/{self.max_retries}): {e}"
                )
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)  # exponential backoff
                    continue
        self.logger.error(f"All retries failed: {url}")
        return None

    def parse_links(self, html: str, base_url: str) -> List[str]:
        """
        Extract links from HTML (naive implementation).

        Parameters:
            html: HTML content
            base_url: base URL used to resolve relative links

        Returns:
            list of absolute URLs
        """
        import re
        links = []
        # Naive regex matching; for real use, prefer BeautifulSoup
        pattern = r'href=[\'"]?([^\'" >]+)'
        matches = re.findall(pattern, html)
        for match in matches:
            # Resolve to an absolute URL
            absolute_url = urljoin(base_url, match)
            # Validate the URL structure
            parsed = urlparse(absolute_url)
            if parsed.scheme and parsed.netloc:
                links.append(absolute_url)
        return links

    def worker(self):
        """Worker thread loop."""
        while True:
            try:
                url, callback, depth = self.queue.get(timeout=30)
                # Skip URLs we have already visited
                with self.lock:
                    if url in self.visited:
                        self.queue.task_done()
                        continue
                    self.visited.add(url)
                self.logger.info(f"Fetching: {url} (depth: {depth})")
                # Fetch the page
                response = self.fetch(url)
                if response:
                    # Let the callback process the result
                    result = callback(response, url, depth) if callback else response.text
                    with self.lock:
                        self.results.append({
                            'url': url,
                            'content': result,
                            'depth': depth
                        })
                    # If depth remains, parse links and enqueue them
                    if depth > 0:
                        links = self.parse_links(response.text, url)
                        for link in links:
                            with self.lock:
                                if link not in self.visited:
                                    self.queue.put((link, callback, depth - 1))
                # Delay to avoid hammering the server
                time.sleep(self.delay)
                self.queue.task_done()
            except Empty:
                break
            except Exception as e:
                self.logger.error(f"Worker thread error: {e}")
                self.queue.task_done()

    def crawl(
        self,
        start_urls: List[str],
        max_depth: int = 1,
        callback: Optional[Callable] = None
    ) -> List[Dict[str, Any]]:
        """
        Start crawling.

        Parameters:
            start_urls: list of seed URLs
            max_depth: maximum crawl depth
            callback: custom callback for processing each response

        Returns:
            list of crawl results
        """
        # Reset state
        self.results = []
        self.visited = set()
        # Seed the queue
        for url in start_urls:
            self.queue.put((url, callback, max_depth))
        # Spawn worker threads
        threads = []
        for _ in range(min(self.max_workers, len(start_urls))):
            thread = threading.Thread(target=self.worker)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # Wait for the queue to drain
        self.queue.join()
        # Wait for all threads to finish
        for thread in threads:
            thread.join(timeout=5)
        return self.results

    def save_results(self, filename: str, format: str = 'json'):
        """
        Save results to a file.

        Parameters:
            filename: output file name
            format: output format ('json' or 'txt')
        """
        import json
        with open(filename, 'w', encoding='utf-8') as f:
            if format.lower() == 'json':
                json.dump(self.results, f, ensure_ascii=False, indent=2)
            else:
                for result in self.results:
                    f.write(f"URL: {result['url']}\n")
                    f.write(f"Depth: {result['depth']}\n")
                    # content may be a dict if a callback was used, so stringify first
                    f.write(f"Content:\n{str(result['content'])[:500]}...\n")
                    f.write("-" * 50 + "\n")
        self.logger.info(f"Results saved to: {filename}")

    def close(self):
        """Close the session."""
        self.session.close()


# Usage example
def example_callback(response, url, depth):
    """Example callback: extract the title and a text preview."""
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the title
    title = soup.title.string if soup.title and soup.title.string else 'No title'
    # Extract body text (naive example)
    text = soup.get_text()[:200]  # first 200 characters
    return {
        'url': url,
        'title': title.strip(),
        'preview': text.strip(),
        'status': response.status_code,
        'depth': depth
    }


if __name__ == "__main__":
    # Example usage
    claw = OpenClawLite(
        max_workers=3,
        delay=1.0,
        max_retries=2
    )
    try:
        # Start crawling
        results = claw.crawl(
            start_urls=['https://httpbin.org/html'],
            max_depth=1,
            callback=example_callback
        )
        # Print results
        for result in results:
            print(f"URL: {result['content']['url']}")
            print(f"Title: {result['content']['title']}")
            print(f"Preview: {result['content']['preview']}")
            print()
        # Save results
        claw.save_results('results.json', 'json')
    finally:
        claw.close()
```
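The regex in `parse_links` is intentionally naive; as its inline comment notes, BeautifulSoup handles real-world markup far better. Here is a minimal drop-in alternative, assuming `beautifulsoup4` is installed (the name `parse_links_bs4` is illustrative, not part of the original API):

```python
from typing import List
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup


def parse_links_bs4(html: str, base_url: str) -> List[str]:
    """Extract absolute links using BeautifulSoup instead of a regex."""
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    # find_all('a', href=True) yields only anchors that actually carry an href
    for anchor in soup.find_all('a', href=True):
        absolute_url = urljoin(base_url, anchor['href'])
        parsed = urlparse(absolute_url)
        # Keep only well-formed URLs with a scheme and host
        if parsed.scheme and parsed.netloc:
            links.append(absolute_url)
    return links
```

This keeps the same signature as `parse_links`, so swapping it in requires no changes to `worker`.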
## An Even Lighter Version

If you need an even lighter version that does not use BeautifulSoup, you can build a pure-requests variant:
```python
# openclaw_mini.py
import requests
import json
import time


class OpenClawMini:
    """OpenClaw ultra-lite edition."""

    def __init__(self):
        self.session = requests.Session()
        # update() keeps requests' default headers instead of replacing them
        self.session.headers.update({
            'User-Agent': 'OpenClawMini/1.0'
        })

    def get(self, url, **kwargs):
        """Simple GET request."""
        try:
            response = self.session.get(url, timeout=10, **kwargs)
            return response.text
        except Exception as e:
            print(f"Request failed: {e}")
            return None

    def post(self, url, data=None, **kwargs):
        """Simple POST request."""
        try:
            response = self.session.post(url, data=data, timeout=10, **kwargs)
            return response.text
        except Exception as e:
            print(f"Request failed: {e}")
            return None

    def scrape(self, urls):
        """Scrape a batch of URLs."""
        results = []
        for url in urls:
            print(f"Fetching: {url}")
            html = self.get(url)
            if html:
                results.append({
                    'url': url,
                    'content': html[:1000]  # keep only the first 1000 characters
                })
            time.sleep(1)  # polite delay
        return results


# Usage example
if __name__ == "__main__":
    claw = OpenClawMini()
    # Fetch a single page
    html = claw.get("https://httpbin.org/html")
    if html:
        print(f"Got {len(html)} characters")
    # Batch scraping
    results = claw.scrape([
        "https://httpbin.org/html",
        "https://httpbin.org/get"
    ])
    # Save results
    with open('scraped.json', 'w') as f:
        json.dump(results, f, indent=2)
```
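OpenClawMini deliberately drops the retry mechanism that OpenClawLite provides. If you want it back without pulling in the full class, here is a minimal sketch of the same exponential-backoff loop; `get_with_retry` is a hypothetical helper, not part of the original API:

```python
import time

import requests


def get_with_retry(session: requests.Session, url: str,
                   retries: int = 3, timeout: int = 10):
    """GET with the same exponential backoff that OpenClawLite.fetch uses."""
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Request failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s...
    return None
```

You could call it as `get_with_retry(claw.session, url)` in place of `claw.get(url)`.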
## Quick Start

1. Install the dependencies (full version):

   ```bash
   pip install requests beautifulsoup4
   ```

2. Use the full version:

   ```python
   from openclaw_lite import OpenClawLite

   claw = OpenClawLite(max_workers=3)
   results = claw.crawl(['https://example.com'], max_depth=1)
   ```

3. Use the ultra-lite version (no extra dependencies):

   ```python
   from openclaw_mini import OpenClawMini

   claw = OpenClawMini()
   html = claw.get('https://example.com')
   ```
This lite version keeps OpenClaw's core functionality and is well suited to learning and rapid prototyping. For production use, consider a mature framework such as Scrapy, or add more robust error handling and features.