from openpyxl import load_workbook,Workbook
from openpyxl.drawing.image import Image
from openpyxl.styles import Alignment
from PIL import Image as I
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import os
import csv
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
def GetUrl(url,div_num=6):
    """Collect game detail-page links from one listing page.

    Downloads *url*, decodes it as gb2312 and appends one absolute
    ``https://www.4399.com`` link per list item to the module-level
    ``every_urls`` list.

    Args:
        url: Address of the listing page to fetch.
        div_num: 1-based position of the ``<div>`` under ``<body>`` that
            holds the ``<ul>`` of games.

    Returns:
        None. Results are accumulated in ``every_urls``.
    """
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    tree = etree.HTML(response.text)
    # Walk the matched <li> elements directly instead of re-running an
    # absolute, index-based query from the document root for every item.
    for item in tree.xpath(f'/html/body/div[{div_num}]/ul/li'):
        hrefs = item.xpath('./a/@href')
        if not hrefs:
            # A list item without a link (e.g. a separator) must not abort
            # the rest of the page.
            continue
        every_urls.append("https://www.4399.com" + hrefs[0])
def Get_Data(url):
    """Scrape one game page: record a CSV row and download its cover image.

    The page is decoded as gb2312. On a non-200 response nothing is written.
    Otherwise a ``[title, link, font text]`` row is appended to
    ``./game_data.csv``, the cover image is saved as
    ``./游戏图片/<title>.<extension>`` and a marker is appended to the
    module-level ``test`` list so the caller can count completed pages.

    Args:
        url: Address of the game detail page.

    Returns:
        None.

    Raises:
        IndexError: If one of the expected elements is missing from the page.
    """
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    if response.status_code != 200:
        return
    tree = etree.HTML(response.text)
    link = 'https://www.4399.com' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/@href')[0]
    title = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/text()')[0]
    font = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[4]/div/font/text()')[0]
    # Open the CSV only once there is a row to write, and close it right
    # away so the handle is never leaked on an early return or an exception.
    with open("./game_data.csv", "a", newline="", encoding="utf-8") as csv_file:
        csv.writer(csv_file).writerow([title, link, font])
    img = 'https:' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div[1]/a/img/@src')[0]
    # The file extension is whatever follows the last dot of the image URL.
    extension = img.split(".")[-1]
    image_bytes = requests.get(url=img, headers=headers).content
    # Make sure the target directory exists; exist_ok keeps this safe when
    # several worker threads get here at the same time.
    os.makedirs("./游戏图片", exist_ok=True)
    with open(f"./游戏图片/{title}.{extension}", "wb") as image_file:
        image_file.write(image_bytes)
    test.append("1")
def _make_thumbnail(title):
    """Write a half-size copy of the cover for *title*; return the path to embed.

    Reads ``./游戏图片/<title>.jpg`` and writes the shrunken copy to
    ``./图片缓存/<title>.jpg``. Falls back to the placeholder ``./无.jpg``
    when the cover cannot be read, resized or written.
    """
    # NOTE(review): only ".jpg" covers are looked up here; covers stored
    # under another extension end up with the placeholder - confirm intent.
    source = f'./游戏图片/{title}.jpg'
    target = f'./图片缓存/{title}.jpg'
    try:
        # The context manager releases the underlying file handle.
        with I.open(source) as picture:
            width, height = picture.size
            picture.resize((width // 2, height // 2)).save(target)
    except (OSError, ValueError):
        # OSError: missing/unreadable/unwritable image;
        # ValueError: e.g. a target size of zero for a tiny image.
        return './无.jpg'
    return target


def _add_picture(sheet, image_path, anchor):
    """Embed *image_path* at *anchor*; return False if it cannot be loaded."""
    try:
        sheet.add_image(Image(image_path), anchor)
    except (OSError, ValueError):
        return False
    return True


def Save():
    """Export ``game_data.csv`` to ``game.xlsx`` with one picture per row.

    For every CSV row, column A receives a thumbnail of the game's cover
    (or the placeholder image) and columns B, C and D receive the first
    three CSV fields, centred.

    A picture that cannot be loaded no longer prevents the row's text from
    being written: the picture is skipped and the text cells are still
    filled in.

    Returns:
        None. The workbook is written to ``game.xlsx``.
    """
    # newline="" is the mode the csv module expects for its input files.
    with open("game_data.csv", "r", encoding="utf-8", errors='ignore', newline="") as f:
        datas = list(csv.reader(f))

    wb = Workbook()
    sheet = wb.active
    alignment = Alignment(horizontal='center', vertical='center')

    # Column widths belong to the sheet, not to a row: set them once.
    sheet.column_dimensions['A'].width = 20
    sheet.column_dimensions['B'].width = 22
    sheet.column_dimensions['C'].width = 38.18

    for num, row in enumerate(datas, start=1):
        sheet.row_dimensions[num].height = 75.5

        # An empty CSV row has no title, so it can only get the placeholder.
        image_path = _make_thumbnail(row[0]) if row else './无.jpg'
        # Upper-case column letter: the canonical A1-style coordinate form.
        _add_picture(sheet, image_path, f'A{num}')

        # zip() stops at the shorter input, so short rows fill only the
        # cells they have data for and extra CSV fields are ignored.
        for column, value in zip('BCD', row):
            cell = sheet[f'{column}{num}']
            cell.value = value
            cell.alignment = alignment

    wb.save('game.xlsx')
if __name__ == '__main__':
    # Module-level state shared with the worker functions:
    # GetUrl appends to every_urls, Get_Data appends to test.
    test = []
    every_urls = []

    if not os.path.exists('./图片缓存'):
        os.mkdir('./图片缓存')
        print("文件已创建")
    # Get_Data stores the downloaded covers here, so it has to exist
    # before the download workers start.
    os.makedirs('./游戏图片', exist_ok=True)

    # Start from an empty CSV. The workers append through their own handles,
    # so there is no reason to keep this one open during the crawl.
    with open("./game_data.csv", "w", encoding="utf-8", newline=""):
        pass

    # Listing pages 2..10 follow the new_<n>.htm pattern.
    urls = [f'https://www.4399.com/flash/new_{i}.htm' for i in range(2, 11)]

    # The first listing page is fetched with a different div index (8
    # instead of the default 6) and from the main thread.
    GetUrl("https://www.4399.com/flash/new.htm", div_num=8)
    # Leaving the with-block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=10) as e:
        for url in urls:
            e.submit(GetUrl, url)
    print("多线程爬取到的所有url链接···")

    with ThreadPoolExecutor(max_workers=100) as e:
        for url in every_urls:
            e.submit(Get_Data, url)
    print("链接总共的条数有:", len(test))

    print("等待图片跟数据的保存···")
    Save()
    print("数据已获取保存")