Python 爬虫并保存 Excel 实例
Python 实现爬取亚马逊数据并导出 Excel 文件的操作示例。本文实例讲述了 Python 爬取亚马逊数据并生成 Excel 文件的操作,分享给大家供大家参考,具体如下:
python大神们别喷,代码写的很粗糙,主要是完成功能,能够借鉴就看下吧,我是学java的,毕竟不是学python的,自己自学看了一点点python,望谅解。
|
#!/usr/bin/env python3
# encoding=utf-8
"""Scrape Amazon.cn: fetch the front-page category names, pull each
category's review-ranked search-results page, and collect per-product
fields (category, title, price, seller count, stars, comment count).

NOTE(review): reconstructed from a whitespace-collapsed blog excerpt.
The scrape had lowercased ``re.M``/``re.S`` and broken a raw-string
regex across a line; both are repaired here.  The original script
continues past this excerpt (presumably the Excel export implied by the
``xlwt`` import and the article title) — that tail is not visible and
is NOT reproduced.
"""
import sys
import re
import urllib.request
import json
import time
import zlib
from html import unescape
import threading
import os
import xlwt
import math
import requests

# Original author's setting ("例如这里设置递归为一百万").  The code below is
# entirely iterative, so this huge limit is almost certainly unnecessary;
# kept for fidelity to the original script.
sys.setrecursionlimit(1000000000)

# Single shared User-Agent (the original duplicated this dict in two
# functions) so Amazon serves the normal desktop page.
HEADERS = {
    "user-agent": "mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/50.0.2661.102 safari/537.36"
}


def getprourl():
    """Fetch the Amazon.cn front page and return the category names.

    Returns whatever the ``"category" : "..."`` pattern captures in the
    page source; callers filter out non-category entries such as
    '全部分类' and 'prime会员优先购'.
    """
    session = requests.session()
    furl = "https://www.amazon.cn/?tag=baidu250-23&hvadid={creative}&ref=pz_ic_22fvxh4dwf_e&page="
    urllist = []
    for i in range(0, 1):  # the original only ever fetched page 0
        resp = session.post(furl + str(i), headers=HEADERS)
        resp.encoding = 'utf-8'
        # The scraped copy spelled the flag ``re.s``; the real name is re.S.
        # (An unused ASIN regex and a no-op gb2312 encode/decode round-trip
        # from the original were removed as dead code.)
        reg1 = re.compile('"category" : "(.*?)"', re.S)
        urllist = reg1.findall(resp.text)
    return urllist


def geturldata(ci):
    """Return the search-results URL (page 1, sorted by review rank) for category *ci*."""
    return "https://www.amazon.cn/s/ref=nb_sb_noss_2?__mk_zh_cn=%e4%ba%9a%e9%a9%ac%e9%80%8a%e7%bd%91%e7%ab%99&url=search-alias%3daps&field-keywords=" + ci + "&page=1&sort=review-rank"


def fun_timer():
    """Throttle requests: sleep 3 seconds between fetches.

    (The original comment claimed 1 second; the code has always slept 3.)
    """
    time.sleep(3)


def getprodata(allurllist):
    """Fetch the review-ranked search page for every category name given.

    Prints each page (the original's debug output, kept) and returns the
    list of raw HTML texts, one per category, for downstream extraction.
    The original also ran an ASIN-capturing ``findall`` whose result was
    never used — the scrape had even broken that regex literal across a
    line; it is removed here as dead code.
    """
    webcontenthtmllist = []
    for ci in allurllist:
        session = requests.session()
        fun_timer()  # be polite between requests
        resp = session.get(geturldata(ci), headers=HEADERS)
        resp.encoding = 'utf-8'
        print(resp.text)
        webcontenthtmllist.append(resp.text)
    return webcontenthtmllist


def getprovalue():
    """Collect per-product rows for (up to) the first 40 categories.

    For each fetched page, extracts 15 products' fields via helpers
    defined elsewhere in the original article (``getprocategory``,
    ``getprotitle``, ``getproprice``, ``getsellercount``, ``getprostar``,
    ``getprocommentcount`` — not part of this excerpt).

    BUG FIXED: the original bucketed categories into eight hand-written
    lists with a cascading ``if`` chain (no ``elif``); every 5th category
    was double-appended (index 4 landed in both list1 and list2, index 9
    in list2 and list3, ...).  Chunked slicing below preserves the
    intended 8-chunks-of-5 behaviour without the duplication.

    NOTE(review): the source excerpt is truncated inside this function;
    per its imports, the original presumably went on to write the rows
    to Excel with ``xlwt``.  That tail is not reproduced.
    """
    urllist = getprourl()
    urllist.remove('全部分类')
    urllist.remove('prime会员优先购')

    # Chunks of five categories, first 40 only — mirrors the original's
    # eight manual lists (short/empty chunks are harmless).
    pages = []
    for start in range(0, 40, 5):
        pages.extend(getprodata(urllist[start:start + 5]))

    # One loop replaces the original's six-plus identical copy-pasted
    # per-chunk loops (the excerpt cut off inside chunk 7's copy).
    datatwoalllist1 = []
    print("开始检索数据,检索数据中..........")
    for html in pages:
        for i in range(15):  # the original assumed 15 products per page
            datalist = [
                unescape(getprocategory(html, i)),
                unescape(getprotitle(html, i)),
                getproprice(html, i),
                getsellercount(html, i),
                getprostar(html, i),
                getprocommentcount(html, i),
            ]
            print(datalist)
            datatwoalllist1.append(datalist)
    # Returned so the (truncated) Excel-export tail can consume the rows.
    return datatwoalllist1
热门推荐
|