WeChat Official Account Article Crawler
Scraping Sogou WeChat official account articles with Python. As a Python beginner project, the script below searches each official account on Sogou WeChat, follows the search result to the account's article list, fetches every article page, and stores it in MySQL.
MySQL tables:
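The original post shows the table structure as screenshots, which are not reproduced here. Below is a minimal sketch of the two tables the script expects, inferred from the code: the column names and types are assumptions, except that hd_gzh must keep the account name in its second column (the code reads row[1]) and gzh_article must have title, picture, author and content columns.

import pymysql

# Minimal, assumed DDL for the two tables used by the crawler below.
conn = pymysql.connect(host='your-db-host', port=3306, user='username',
                       passwd='password', db='database-name', charset='utf8')
cursor = conn.cursor()

# List of official accounts to crawl; the second column (row[1]) is used
# as the Sogou search keyword, so it should hold the account name.
cursor.execute("""
create table if not exists hd_gzh (
    id int primary key auto_increment,
    name varchar(100) not null
)
""")

# Crawled articles: title, cover picture URL, author, and the full HTML content.
cursor.execute("""
create table if not exists gzh_article (
    id int primary key auto_increment,
    title varchar(255),
    picture varchar(500),
    author varchar(100),
    content longtext
)
""")

conn.commit()
cursor.close()
conn.close()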
Code:
import requests
import json
import re
import socket
import time
import pymysql
from bs4 import BeautifulSoup

# Create the database connection (fill in your own credentials)
conn = pymysql.connect(host='your-db-host', port=3306, user='username',
                       passwd='password', db='database-name', charset='utf8')
# Create a cursor and load the list of official accounts to crawl
cursor = conn.cursor()
cursor.execute("select * from hd_gzh")
effect_row = cursor.fetchall()

socket.setdefaulttimeout(60)
count = 1
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}

# Abuyun IP proxy, not used for now
# proxyhost = "http-cla.abuyun.com"
# proxyport = "9030"
# # Proxy tunnel credentials
# proxyuser = "h56761606429t7uc"
# proxypass = "9168eb00c4167176"
# proxymeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
#     "host": proxyhost,
#     "port": proxyport,
#     "user": proxyuser,
#     "pass": proxypass,
# }
# proxies = {
#     "http": proxymeta,
#     "https": proxymeta,
# }

# Check whether an article with this title is already stored;
# returns True when it is new (safe to insert), False when it already exists
def checkdata(name):
    sql = "select * from gzh_article where title = '%s'"
    data = (name,)
    count = cursor.execute(sql % data)
    conn.commit()
    if count != 0:
        return False
    else:
        return True

# Insert one article
def insertdata(title, picture, author, content):
    sql = "insert into gzh_article (title,picture,author,content) values ('%s', '%s', '%s', '%s')"
    data = (title, picture, author, content)
    cursor.execute(sql % data)
    conn.commit()
    print("Inserted one row")
    return

for row in effect_row:
    # Search the account name (row[1]) on Sogou WeChat
    newsurl = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=' + row[1] + '&ie=utf8&_sug_=n&_sug_type_='
    res = requests.get(newsurl, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The first search result links to the account's profile page
    url = 'https://weixin.sogou.com' + soup.select('.tit a')[0]['href']
    res2 = requests.get(url, headers=headers)
    res2.encoding = 'utf-8'
    soup2 = BeautifulSoup(res2.text, 'html.parser')
    # Sogou rebuilds the real profile URL piece by piece in a script: url += '...';
    pattern = re.compile(r"url \+= '(.*?)';", re.MULTILINE | re.DOTALL)
    script = soup2.find("script")
    url2 = pattern.search(script.text).group(1)
    res3 = requests.get(url2, headers=headers)
    res3.encoding = 'utf-8'
    soup3 = BeautifulSoup(res3.text, 'html.parser')
    # The profile page embeds the article list as JSON: var msgList = {...};
    pattern2 = re.compile(r"var msgList = (.*?);$", re.MULTILINE | re.DOTALL)
    script2 = soup3.find("script", text=pattern2)
    s2 = json.loads(pattern2.search(script2.text).group(1))
    # Wait 10s to avoid being blocked
    time.sleep(10)
    for news in s2["list"]:
        # content_url comes HTML-escaped, so restore '&' before requesting it
        articleurl = "https://mp.weixin.qq.com" + news["app_msg_ext_info"]["content_url"]
        articleurl = articleurl.replace('&amp;', '&')
        res4 = requests.get(articleurl, headers=headers)
        res4.encoding = 'utf-8'
        soup4 = BeautifulSoup(res4.text, 'html.parser')
        if checkdata(news["app_msg_ext_info"]["title"]):
            insertdata(news["app_msg_ext_info"]["title"], news["app_msg_ext_info"]["cover"],
                       news["app_msg_ext_info"]["author"], pymysql.escape_string(str(soup4)))
        count += 1
        # Wait 10s
        time.sleep(10)
        # Secondary articles pushed together with the main one
        for news2 in news["app_msg_ext_info"]["multi_app_msg_item_list"]:
            articleurl2 = "https://mp.weixin.qq.com" + news2["content_url"]
            articleurl2 = articleurl2.replace('&amp;', '&')
            res5 = requests.get(articleurl2, headers=headers)
            res5.encoding = 'utf-8'
            soup5 = BeautifulSoup(res5.text, 'html.parser')
            if checkdata(news2["title"]):
                insertdata(news2["title"], news2["cover"], news2["author"], pymysql.escape_string(str(soup5)))
            count += 1
            # Wait 10s
            time.sleep(10)

cursor.close()
conn.close()
print("Done")
That's all for this article. I hope it helps with your learning, and please continue to support 开心学习网.
Original article: https://blog.csdn.net/a2398936046/article/details/88814078