
WeChat Official Account Article Crawler (Scraping Sogou WeChat Official Account Articles with Python)

Date: 2021-10-26 11:09:27


As a Python beginner, I wrote a crawler that scrapes WeChat official account articles from Sogou WeChat search and stores them in MySQL.

MySQL tables:

(The original post shows screenshots of the hd_gzh and gzh_article table structures here.)
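The table screenshots themselves are not preserved here. Judging from the script below, hd_gzh holds the official-account names to crawl (the script reads the name from the second column of each row), and gzh_article receives the scraped articles. What follows is only a rough reconstruction created with pymysql; every column other than the ones the script actually references (name, title, picture, author, content) is an assumption:

import pymysql

# Fill in your own MySQL credentials (placeholders, same as in the script below)
conn = pymysql.connect(host='your-db-host', port=3306, user='your-user',
                       passwd='your-password', db='your-db', charset='utf8')
cursor = conn.cursor()

# hd_gzh: the list of official accounts to crawl; the script uses row[1] as the Sogou search query
cursor.execute("""
  create table if not exists hd_gzh (
    id int primary key auto_increment,
    name varchar(255) not null
  ) default charset=utf8
""")

# gzh_article: the scraped articles; columns match the insert statement in the script
cursor.execute("""
  create table if not exists gzh_article (
    id int primary key auto_increment,
    title varchar(255),
    picture varchar(512),
    author varchar(255),
    content longtext
  ) default charset=utf8
""")

conn.commit()
cursor.close()
conn.close()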

Code:

import socket
import time
import json
import re
import requests
import pymysql
from bs4 import BeautifulSoup

# Create the database connection (fill in your own credentials)
conn = pymysql.connect(host='your-db-host', port=3306, user='your-user', passwd='your-password', db='your-db', charset='utf8')
# Create a cursor
cursor = conn.cursor()

# Read the list of official accounts to crawl
cursor.execute("select * from hd_gzh")
effect_row = cursor.fetchall()

socket.setdefaulttimeout(60)
count = 1
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}
# Abuyun proxy settings, not used for now
# proxyhost = "http-cla.abuyun.com"
# proxyport = "9030"
# # Proxy tunnel credentials
# proxyuser = "h56761606429t7uc"
# proxypass = "9168eb00c4167176"

# proxymeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
#   "host" : proxyhost,
#   "port" : proxyport,
#   "user" : proxyuser,
#   "pass" : proxypass,
# }

# proxies = {
#   "http" : proxymeta,
#   "https" : proxymeta,
# }

# Check whether an article with this title is already stored
def checkdata(name):
  sql = "select * from gzh_article where title = '%s'"
  data = (name,)
  count = cursor.execute(sql % data)
  conn.commit()
  if count != 0:
    return False
  else:
    return True

# Insert one article
def insertdata(title, picture, author, content):
  sql = "insert into gzh_article (title,picture,author,content) values ('%s', '%s', '%s', '%s')"
  data = (title, picture, author, content)
  cursor.execute(sql % data)
  conn.commit()
  print("Inserted one row")
  return

for row in effect_row:
  # Search Sogou WeChat for the official account by name
  newsurl = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=' + row[1] + '&ie=utf8&_sug_=n&_sug_type_='
  res = requests.get(newsurl, headers=headers)
  res.encoding = 'utf-8'
  soup = BeautifulSoup(res.text, 'html.parser')
  # Follow the first search result to the account's profile page
  url = 'https://weixin.sogou.com' + soup.select('.tit a')[0]['href']
  res2 = requests.get(url, headers=headers)
  res2.encoding = 'utf-8'
  soup2 = BeautifulSoup(res2.text, 'html.parser')
  # The real profile URL is assembled piece by piece in an inline script via "url += '...';"
  pattern = re.compile(r"url \+= '(.*?)';", re.MULTILINE | re.DOTALL)
  script = soup2.find("script")
  url2 = pattern.search(script.text).group(1)
  res3 = requests.get(url2, headers=headers)
  res3.encoding = 'utf-8'
  soup3 = BeautifulSoup(res3.text, 'html.parser')
  # The article list is embedded as JSON in "var msglist = ...;"
  pattern2 = re.compile(r"var msglist = (.*?);$", re.MULTILINE | re.DOTALL)
  script2 = soup3.find("script", text=pattern2)
  s2 = json.loads(pattern2.search(script2.text).group(1))
  # Wait 10s
  time.sleep(10)

  for news in s2["list"]:
    # Main article of each push
    articleurl = "https://mp.weixin.qq.com" + news["app_msg_ext_info"]["content_url"]
    articleurl = articleurl.replace('&amp;', '&')
    res4 = requests.get(articleurl, headers=headers)
    res4.encoding = 'utf-8'
    soup4 = BeautifulSoup(res4.text, 'html.parser')
    if checkdata(news["app_msg_ext_info"]["title"]):
      insertdata(news["app_msg_ext_info"]["title"], news["app_msg_ext_info"]["cover"], news["app_msg_ext_info"]["author"], pymysql.escape_string(str(soup4)))
    count += 1
    # Wait 10s
    time.sleep(10)
    # Secondary articles published in the same push
    for news2 in news["app_msg_ext_info"]["multi_app_msg_item_list"]:
      articleurl2 = "https://mp.weixin.qq.com" + news2["content_url"]
      articleurl2 = articleurl2.replace('&amp;', '&')
      res5 = requests.get(articleurl2, headers=headers)
      res5.encoding = 'utf-8'
      soup5 = BeautifulSoup(res5.text, 'html.parser')
      if checkdata(news2["title"]):
        insertdata(news2["title"], news2["cover"], news2["author"], pymysql.escape_string(str(soup5)))
      count += 1
      # Wait 10s
      time.sleep(10)

cursor.close()
conn.close()
print("Done")
That concludes this article. I hope it helps with your studies, and I hope you will keep supporting 开心学习网.

Original post: https://blog.csdn.net/a2398936046/article/details/88814078
