python3爬虫实例代码
Python3 通过 Selenium 爬虫获取 JD(京东)商品的实例代码。先给大家介绍下 Python3 中 Selenium 的使用。
其实这个就相当于模拟人的点击事件来连续地操作浏览器。如果你玩过王者荣耀的话,在2016年一月份的版本里面就有一个bug:
安卓手机下载一个按键精灵,就可以在冒险模式里面设置按键,让手机自动玩闯关,一局19个金币,一晚上就能攒够一个英雄了。不过
程序员也不是吃素的,后来给每个星期设置了大概4000金币的上限。有兴趣的可以去试试。(注:手机需要root)
进入正题:
|
# Selenium names are case-sensitive: By, Keys and WebDriverWait are classes,
# so the lower-cased spellings raise ImportError.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
在写之前需要先安装 selenium 模块(例如执行 pip install selenium)。
|
# Minimal Selenium demo: search Baidu for "python" and dump the result page.
# The imports are repeated here so this snippet runs on its own.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

brguge = webdriver.Chrome()  # create the Chrome driver object
try:
    brguge.get('https://www.baidu.com')  # send the GET request
    # find_element(By.ID, ...) works on both old and current Selenium
    # releases, unlike the removed find_element_by_id helper.
    search_box = brguge.find_element(By.ID, 'kw')  # locate the search input
    search_box.send_keys('python')  # type the keyword
    search_box.send_keys(Keys.ENTER)  # press Enter to submit
    wait = WebDriverWait(brguge, 10)  # wait up to 10 s for elements to load
    # The expected condition takes ONE locator tuple, not two arguments.
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(brguge.current_url)  # URL of the search result page
    # get_cookie() requires a cookie name; get_cookies() returns all of them.
    print(brguge.get_cookies())
    print(brguge.page_source)  # HTML source of the result page
finally:
    # quit() ends the whole driver session (browser and chromedriver);
    # close() would only close the current window.
    brguge.quit()
下面是一些selenium模块的基本用法
查找元素
单个元素
|
(以下示例均假设已执行 from selenium import webdriver,且 brguge 为已创建的浏览器驱动对象)
brguge.find_element_by_id('q'):查找 id 为 q 的元素
brguge.find_element_by_css_selector('#q'):用 CSS 选择器 #q 查找,同样是找 id 为 q 的元素
brguge.find_element_by_xpath('//*[@id="q"]'):用 XPath 查找 id 为 q 的元素(以上三种写法效果一样)
brguge.find_element_by_name():通过 name 属性来查找
brguge.find_element_by_link_text():通过链接的完整文本来查找
brguge.find_element_by_partial_link_text():通过链接的部分文本来查找
brguge.find_element_by_tag_name():通过标签名来查找
brguge.find_element_by_class_name():通过 class 查找
通用查找方式(需要先执行 from selenium.webdriver.common.by import By):
brguge.find_element(By.ID, 'q')
注:find_element_by_* 系列方法在 Selenium 4 中已被弃用,并在之后的版本中被移除,新代码建议统一使用 find_element(By.XXX, ...) 的写法。
多个元素(find_elements):方法名多了一个 s
它会以列表的形式返回所有匹配到的元素
brguge.find_elements_by_css_selector('.service-bd li'):查找 class 为 service-bd 的元素下面的所有 li 元素
brguge.find_elements(By.CSS_SELECTOR, '.service-bd li'):与上一行作用一样
(利用索引就可以从返回的列表中获取单个或多个元素了)
元素交互操作(先获取元素,然后再给它下达指令)
选择输入框 --》send_keys('输入文字')--》clear()清空输入框--》再输入别的--》找到搜索按钮--》click()点击
input.clear():清空输入框
交互动作(将动作附加到动作链中串行执行)
switch_to_frame('iframeresult'):先切换到目标元素所在的 iframe
用 CSS 选择器分别找到要交互的两个元素(source 和 target)
创建 ActionChains 对象(把谷歌浏览器的驱动对象传进去)
drag_and_drop(source, target):把第一个元素拖到第二个元素上面
perform():执行动作链中的所有动作
下面看下 Python3 通过 Selenium 爬虫获取 JD(京东)商品的实例代码。
具体代码如下所示:
|
import json
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

JD_URL_LOGIN = "https://www.jd.com/"


class CustomizeException(Exception):
    """Scraper-specific error carrying a numeric status code and a message.

    Args:
        status: Numeric code identifying the condition (10000 and 20000 are
            the codes raised in this script).
        msg: Human-readable description.
    """

    def __init__(self, status, msg):
        # Forward to Exception so str(exc) / tracebacks show the details.
        super().__init__(status, msg)
        self.status = status
        self.msg = msg


class JD:
    """Scrape product listings from JD search results with headless Chrome.

    Every product found is appended as one JSON object per line to
    ``jd-<goods>.json`` in the current working directory.
    """

    def __init__(self):
        # Set every attribute up front so close() / __del__ are safe even
        # when browser start-up fails half-way.
        self.browser = None
        self.wait = None
        self.file = None
        self.__init_browser()

    def __init_browser(self):
        """Start headless Chrome, open the JD home page and build the waiter."""
        options = Options()
        options.add_argument("--headless")
        # Chrome option names are case-sensitive ("excludeSwitches").
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Disable image loading.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2}
        )
        self.browser = webdriver.Chrome(options=options)
        # Maximize the browser window.
        self.browser.maximize_window()
        # Implicit wait of 3 seconds for element lookups.
        self.browser.implicitly_wait(3)
        self.browser.get(JD_URL_LOGIN)
        self.wait = WebDriverWait(self.browser, 10)

    def __close_file(self):
        """Close the current output file, if one is open."""
        if self.file is not None:
            self.file.close()
            self.file = None

    def __search_goods(self, goods):
        """Open the output file for *goods* and submit it in the search box."""
        # Close the previous product's file first so handles do not leak
        # when several products are scraped in a row.
        self.__close_file()
        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
        self.wait.until(EC.presence_of_all_elements_located((By.ID, "key")))
        search_input = self.browser.find_element(By.ID, "key")
        search_input.clear()
        search_input.send_keys(goods, Keys.ENTER)

    def __get_goods_info(self, page_source):
        """Yield one dict per product parsed from the page HTML.

        Each dict has the keys ``goods_name``, ``goods_price``,
        ``comment_num`` and ``shop_name``.
        """
        selector_html = etree.HTML(page_source)
        # NOTE(review): the XPath expressions are kept exactly as published.
        # The ``li`` tag names look suspicious for these class names; verify
        # them against the live page markup before relying on the output.
        # Product name, taken from the link's title attribute.
        goods_name = selector_html.xpath(
            "//li[@class='gl-i-wrap']//li[contains(@class,'p-name')]/a/@title"
        )
        # Product price.
        goods_price = selector_html.xpath(
            "//li[@class='gl-i-wrap']//li[@class='p-price']/strong/i/text()"
        )
        # Number of reviews (full text content of each <strong> node).
        comment_nodes = selector_html.xpath("//li[@class='p-commit']/strong")
        comment_num = [node.xpath("string(.)") for node in comment_nodes]
        # Shop name.
        shop_name = selector_html.xpath("//a[@class='curr-shop']/text()")
        # NOTE: the four lists are paired by position and zip() stops at the
        # shortest, so a product missing one field shifts later rows.
        for name, price, comments, shop in zip(
            goods_name, goods_price, comment_num, shop_name
        ):
            yield {
                "goods_name": name,
                "goods_price": price,
                "comment_num": comments,
                "shop_name": shop,
            }

    def __swipe_page(self):
        """Scroll down until the page stops growing; return the page source."""
        # DOM property and method names are case-sensitive in JavaScript.
        height = self.browser.execute_script("return document.body.scrollHeight;")
        self.browser.execute_script("window.scrollTo(0, {});".format(height))
        while True:
            time.sleep(1)  # give lazily loaded items time to render
            now_height = self.browser.execute_script(
                "return document.body.scrollHeight;"
            )
            if height == now_height:
                return self.browser.page_source
            self.browser.execute_script(
                "window.scrollTo({}, {});".format(height, now_height)
            )
            height = now_height

    def __is_element_exists(self, xpath):
        """Return True when *xpath* matches an element on the current page."""
        try:
            self.browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False
        return True

    def __click_next_page(self):
        """Click the "next page" link.

        Raises:
            CustomizeException: status 10000 when there is no further page.
        """
        try:
            self.wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next"))
            )
        except TimeoutException as err:
            # No pager appeared within the wait: treat it as the last page
            # instead of letting the timeout abort the whole run.
            raise CustomizeException(10000, "该商品访问完毕") from err
        xpath = "//a[@class='pn-next']"
        if not self.__is_element_exists(xpath):
            raise CustomizeException(10000, "该商品访问完毕")
        self.browser.find_element(By.XPATH, xpath).click()

    def __write_to_json(self, dic: dict):
        """Append *dic* to the current output file as one JSON line."""
        data_json = json.dumps(dic, ensure_ascii=False)
        self.file.write(data_json + "\n")

    def __scrape_goods(self, goods):
        """Scrape every result page for one product keyword."""
        self.__search_goods(goods)
        page = 1
        while True:
            print("正在爬取商品 <{}>---第{}页......".format(goods, page))
            time.sleep(3)
            html = self.__swipe_page()
            for dic in self.__get_goods_info(html):
                self.__write_to_json(dic)
            try:
                self.__click_next_page()
            except CustomizeException:
                return
            page += 1

    def run(self, goods, pending=None):
        """Scrape *goods*, then every keyword remaining in *pending*.

        Args:
            goods: First product keyword to search for.
            pending: Optional list of further keywords. Items are popped
                from the front as they are processed, so the list is empty
                when this method returns.

        Products are processed in a loop rather than by recursion, so a
        finished product never resumes an outer page loop and re-scrapes
        the last page.
        """
        queue = pending if pending is not None else []
        while True:
            self.__scrape_goods(goods)
            if not queue:
                return
            goods = queue.pop(0)

    def close(self):
        """Close the output file and shut the browser down. Safe to repeat."""
        self.__close_file()
        if self.browser is not None:
            # quit() ends the driver session, not just the current window.
            self.browser.quit()
            self.browser = None

    def __del__(self):
        # Safety net only; callers should call close() explicitly.
        self.close()


if __name__ == '__main__':
    goods_list = [
        "纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视",
        "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶", "女士洗面奶",
        "沐浴露", "洗发露", "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服",
        "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲",
    ]
    # Validate before starting a browser so nothing needs cleaning up.
    if not goods_list:
        raise CustomizeException(20000, "goods_list不能为空")
    jd = JD()
    try:
        jd.run(goods_list.pop(0), goods_list)
    finally:
        jd.close()
总结
以上所述是小编给大家介绍的 Python3 通过 Selenium 爬虫获取 JD(京东)商品的实例代码,希望对大家有所帮助。如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对开心学习网网站的支持!
如果你觉得本文对你有帮助,欢迎转载,烦请注明出处,谢谢!原文链接:https://www.cnblogs.com/zhuchunyu/archive/2019/04/25/10765875.html