# 系统架构设计 (System architecture design)
class Diagram:
    """Block diagram of the crawler's pipeline modules.

    The module labels are listed in pipeline order and are drawn as a vertical
    stack of boxes, first module on top, with an arrow from each box to the
    next one.
    """

    def __init__(self):
        # Box labels in pipeline order; the first entry is drawn at the top.
        self.modules = [
            "URL管理器",
            "下载器",
            "解析器",
            "存储器",
        ]

    def show_architecture(self, output_path='architecture.png'):
        """Draw the module stack and save it as an image.

        Args:
            output_path: File the figure is written to via ``Figure.savefig``.
                Defaults to ``'architecture.png'``.
        """
        # Imported here so that building a Diagram does not require matplotlib.
        import matplotlib.patches as patches
        import matplotlib.pyplot as plt

        box_left, box_width, box_height = 1, 3, 1.5
        row_pitch = 2  # distance between the bottoms of two adjacent boxes
        gap = row_pitch - box_height  # free space an arrow has to bridge
        centre_x = box_left + box_width / 2
        count = len(self.modules)

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            for position, module in enumerate(self.modules):
                # Rows are numbered from the top so the list order reads
                # downwards, matching the direction of the arrows.
                bottom = (count - 1 - position) * row_pitch + 1
                ax.add_patch(
                    patches.Rectangle(
                        (box_left, bottom), box_width, box_height,
                        linewidth=2,
                        edgecolor='#4B0082',
                        facecolor='#E6E6FA',
                    )
                )
                ax.text(centre_x, bottom + 0.7, module,
                        ha='center', fontsize=12)

                if position < count - 1:
                    # Start on this box's lower edge and stop exactly on the
                    # next box's upper edge, so the arrow sits in the gap
                    # rather than on top of the label.
                    ax.arrow(
                        centre_x, bottom, 0, -gap,
                        length_includes_head=True,
                        head_width=0.2,
                        head_length=0.2,
                        fc='#9370DB',
                        ec='#9370DB',
                    )

            ax.set_xlim(0, 5)
            # Evaluates to 9 for the four default modules and follows the
            # list length if it is changed.
            ax.set_ylim(0, row_pitch * count + 1)
            ax.axis('off')
            ax.set_title('系统架构图', fontsize=14)
            fig.savefig(output_path, dpi=300)
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)
# 多线程爬虫类 (Multithreaded spider class)
import logging
import threading
from queue import Queue
class ThreadedSpider:
    """Process a batch of URLs through fetch -> parse -> store on worker threads.

    ``fetch``, ``parse`` and ``store`` are called on ``self`` but are not
    defined in this class; presumably a subclass or the caller supplies
    them -- TODO confirm.
    """

    def __init__(self, max_threads=5):
        """Set up the work queue and bookkeeping.

        Args:
            max_threads: Number of worker threads ``run`` starts. With a value
                below 1 no worker exists and ``run`` blocks on a non-empty
                URL list.
        """
        self.queue = Queue()
        self.lock = threading.Lock()  # guards ``errors``
        self.threads = []  # workers started by the most recent ``run``
        self.max_threads = max_threads
        self.errors = []  # ``(url, exception)`` pairs from the most recent ``run``

    def worker(self):
        """Consume URLs from the queue until a ``None`` sentinel arrives.

        A failure while handling one URL is logged and recorded in
        ``self.errors``; the worker then carries on with the next URL. Letting
        the exception escape would end the thread, and once every worker had
        died ``queue.join()`` in ``run`` could never return.
        """
        log = logging.getLogger(__name__)
        while True:
            url = self.queue.get()
            try:
                if url is None:
                    break
                html = self.fetch(url)
                data = self.parse(html)
                self.store(data)
            except Exception as exc:
                log.exception("Failed to process %s", url)
                with self.lock:
                    self.errors.append((url, exc))
            finally:
                # Runs for sentinels too, keeping the queue's unfinished-task
                # counter balanced.
                self.queue.task_done()

    def run(self, urls):
        """Process every URL in ``urls`` and return when all are handled.

        Args:
            urls: Iterable of URLs. ``None`` is reserved as the stop signal
                and must not appear in it.
        """
        # Start from a clean slate so repeated calls do not accumulate
        # finished threads or stale errors.
        self.threads = []
        self.errors = []
        for _ in range(self.max_threads):
            t = threading.Thread(target=self.worker)
            t.start()
            self.threads.append(t)
        try:
            for url in urls:
                self.queue.put(url)
            self.queue.join()
        finally:
            # Always stop the workers, even if iterating ``urls`` raised;
            # otherwise they would stay blocked in ``queue.get()``.
            for _ in self.threads:
                self.queue.put(None)
            for t in self.threads:
                t.join()
# 性能测试结果 (Performance test results)
def draw_performance_comparison(output_path='performance_comparison.png'):
    """Plot the hard-coded crawl timings as a grouped bar chart and save it.

    The chart has one group per crawl size (100 / 200 / 500 pages) and one bar
    per concurrency mode; per the y-axis label the values are elapsed seconds.

    Args:
        output_path: File the chart is written to via ``Figure.savefig``.
            Defaults to ``'performance_comparison.png'``.
    """
    # Imported here so the function does not rely on module-level aliases.
    import matplotlib.pyplot as plt
    import pandas as pd

    data = {
        '单线程': [12.5, 24.8, 51.3],
        '多线程(5)': [3.2, 6.7, 13.1],
        '异步爬虫': [2.8, 5.4, 11.2],
    }
    df = pd.DataFrame(data, index=['100页', '200页', '500页'])
    ax = df.plot(
        kind='bar',
        figsize=(10, 6),
        color=['#8A2BE2', '#BA55D3', '#DDA0DD'],
        edgecolor='#4B0082',
        width=0.8,
        rot=0,  # keep the x tick labels horizontal
    )
    fig = ax.get_figure()
    try:
        # Style the axes returned by ``df.plot`` directly instead of relying
        # on pyplot's notion of the current figure.
        ax.set_title('不同并发模式性能对比', fontsize=14)
        ax.set_xlabel('抓取规模', fontsize=12)
        ax.set_ylabel('耗时(秒)', fontsize=12)
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when styling or saving fails.
        plt.close(fig)