python爬虫开源代码（Python实现的文轩网爬虫完整示例）

时间：2021-10-02 01:03:31　类别：脚本大全

python爬虫开源代码

Python实现的文轩网爬虫完整示例

本文实例讲述了Python实现的文轩网爬虫。分享给大家供大家参考，具体如下：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

# encoding=utf8
# NOTE: this script targets Python 2 (it relies on reload(sys) and
# sys.setdefaultencoding, neither of which exists in Python 3).

import pymysql
import time
import sys
import requests
import os
# traceback: error capturing
import traceback
import types
# cgi: used to escape HTML into entities
import cgi
import warnings

# Python 2 idiom: sys.setdefaultencoding is only reachable after reload(sys).
# Kept ahead of the pyquery/lxml imports, as in the original ordering.
reload(sys)
sys.setdefaultencoding('utf-8')

# PyQuery is the callable class; importing the lowercase submodule
# ``pyquery`` would make ``pq(url)`` fail because a module is not callable.
from pyquery import PyQuery as pq
from lxml import etree

# Suppress all warnings
warnings.filterwarnings("ignore")

#下载图片

def dowloadpic(imageurl,filepath):
    """Download the image at ``imageurl`` and save it to ``filepath``.

    Parameters:
        imageurl: URL of the image; fetched with an HTTP GET and a
            60-second timeout.
        filepath: Destination path. It is opened in binary write mode, so
            an existing file at that path is overwritten.

    Returns:
        404 when the server responds with HTTP 404 (nothing is written);
        otherwise None, after the response body has been written to disk.

    Raises:
        Any exception raised by ``requests.get`` or ``open`` propagates to
        the caller unchanged.
    """
    response = requests.get(imageurl, timeout=60)
    # Only a 404 is treated as "image missing"; every other status code
    # falls through and its body is saved as-is.
    if response.status_code == 404:
        return 404
    with open(filepath, "wb") as image_file:
        image_file.write(response.content)

#根据详情页地址抓取数据并插入数据库

def getdata(final_url):

file_open=open('./url.txt', 'w')

file_open.write(final_url)

file_open.close()

#链接数据库

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')

#设置浮标

cursor = conn.cursor(cursor=pymysql.cursors.dictcursor)

#解析详情页面

try:

detail_url=final_url

c=pq(detail_url)

head=c('html').attr('xmlns')

err='http://www.w3.org/1999/xhtml'

err1='http://www.winxuan.com/cms/2016db_sh'

if head == err or head == err1:

return 'back'

except exception, e:

return 'back'

i=0

while i<12:

text = c('#page').find('.cont').find('li').eq(i).text()

text=text.replace('　','')

if 'i s b n' in text:

isbn=text.replace('i s b n：','')

isbn=isbn.strip()

sel='select count(*) from bi_book where isbn ='+isbn

cursor.execute(sel)

result=cursor.fetchone()

count=result['count(*)']

if count != 0 :

print u'已存在'

return 'back'

if 'isbn：' in text :

isbn=text.replace('isbn：','')

isbn=isbn.strip()

sel='select count(*) from bi_book where isbn ='+isbn

cursor.execute(sel)

result=cursor.fetchone()

count=result['count(*)']

if count != 0 :

print u'已存在'

return 'back'

if '作者：' in text :

author = text.replace('作者：','')

if '出版社：' in text :

press_name=text.replace('出版社：','')

if '版次：' in text :

edition=text.replace('版次：','')

if '印次：' in text :

impressions=text.replace('印次：','')

if '装帧：' in text :

packaging=text.replace('装帧：','')

if '开本：' in text:

size=text.replace('开本：','')

if '出版时间：' in text:

press_time=text.replace('出版时间：','')

press_time=press_time.strip()

if press_time == '无':

press_time='1970-01-01'

if '印刷时间：' in text:

print_time=text.replace('印刷时间：','')

print_time=print_time.strip()

if print_time== '无':

print_time='1970-01-01'

if '页数：' in text:

page_num=text.replace('页数：','')

if '字数：' in text:

word_num=text.replace('字数：','')

i+=1

if ('author' in locals().keys()) == false:

author = ''

if ('press_time' in locals().keys()) == false:

press_time = '1970-01-01'

if ('print_time' in locals().keys()) == false:

print_time = '1970-01-01'

if ('impressions' in locals().keys()) == false:

impressions = ''

if ('edition' in locals().keys())== false:

edition = ''

if ('page_num' in locals().keys())== false:

page_num = ''

if ('word_num' in locals().keys())== false:

word_num = ''

if ('packaging' in locals().keys())== false:

packaging = ''

if ('size' in locals().keys())== false:

size = ''

if ('press_name' in locals().keys())== false:

press_name = ''

#暂无图片地址

none_img='http://static.winxuancdn.com/goods/sml_blank.jpg'

#获取大小图地址

big_path=c('.info-side').find('.img').find('a').find('img').attr('src')

if big_path is none:

return 'back'

elif big_path == none_img :

big_path=''

small_path=''

else :

small_path=big_path.replace('_16','_11')

#获取分类

#先获取a标签html

ahtml=c('#page').find('.base-nav').eq(0).html()

#解析a标签html

cate=pq(ahtml)

#获取分类的最后一个分类

category=cate('a:last').text()

#获取书名

name=c('.info-main').find('.name').eq(0).find('h1').eq(0).text()

name=name.strip()

#获取价格

price=c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()

price=price.replace('¥','')

#循环获取内容简介和目录信息

k=5

while k<12:

title=c('#page').find('.title').eq(k).find('.tab').find('h4').text()

if '内容简介' in title:

con=c('#page').find('.title').eq(k).nextall()

det=pq(con)

content=det('.text-words-1').html()

content=content.encode("utf8", "ignore");

if '目录' in title:

con=c('#page').find('.title').eq(k).nextall()

dry=pq(con)

directory=dry('.text-words-1').html()

directory=directory.encode("utf8", "ignore");

k+=1

#如果内容简介和目录没有的时候指定为空字符串

if ('content' in locals().keys())== false:

content = ''

if ('directory' in locals().keys())== false:

directory = ''

details = '内容简介<br>'+content+'<br><br>目录<br>'+directory

details=cgi.escape(details)

#录入时间

add_time = time.strftime('%y-%m-%d',time.localtime(time.time()))

#下载小图

#文件根目录

root_path=sys.path[0]

#创建isbn文件夹路径

root_path=root_path.replace('\\','/')

isbn_path=root_path+'/download/'+isbn

if big_path != '' and small_path !='' :

#创建isbn目录

if os.path.isdir(isbn_path) ==false :

os.mkdir(isbn_path)

#组合下载后图片保存路径

down_img_small = isbn_path+"/small"+

i标签：Python 爬虫   
      
        
        
        上一篇下一篇
        
        猜您喜欢
        
        
          
              python之pil模块使用（Python3安装Pillow与PIL的方法）
              python 百度搜索结果（Python模拟百度自动输入搜索功能的实例）
              python序列定义（详解Python3序列赋值、序列解包）
              python numpy矩阵详解（基于Numpy.convolve使用Python实现滑动平均滤波的思路详解）
              python定义dataframe（对python dataframe逻辑取值的方法详解）
              python装饰器使用说明（详解Python装饰器）
              pythonmysql使用教程（Python异步操作MySQL示例使用aiomysql）
              thinkphp实战教程之博客技术学习（python3编写ThinkPHP命令执行Getshell的方法）
              python如何编写判断正负数程序（Python实现判断一个整数是否为回文数算法示例）
              python循环语句嵌套使用（Python分支语句与循环语句应用实例分析）
              这里输入关键词（怎么输入关键词搜索）
              34岁的舒畅，就这样走到了末路，不知会不会后悔15年前的草率决定（就这样走到了末路）
              不走心的古装造型 舒畅 毁容式 出演，萧蔷雷出新高度（不走心的古装造型）
              嘉南传 第22集（嘉南传第22集）
              哪版孙悟空最萌 黄渤躺萌了（哪版孙悟空最萌）
              融入小人物的喜怒哀乐，黄渤饰演的角色为什么让人观看时欲罢不能（融入小人物的喜怒哀乐）
              
       
        
         热门推荐
           
            
                1. 面试时自我介绍怎么说
                2. angular组件化（详解Angular父子组件通讯）
                3. sqlserver游标使用场景（解析SQL Server聚焦移除Bookmark Lookup、RID Lookup、Key Lookup）
                4. vue项目有element插件（Vue Element前端应用开发之前端API接口的封装）
                5. mysqlmha架构图（MySQL之MHA高可用配置及故障切换实现详细部署步骤）
                6. elasticsearch启动报错（解决Docker启动Elasticsearch7.x报错的问题）
                7. css中div高度自适应
                8. 已授权和未授权（提示您未被授权查看该页怎么解决？）
                
      
    

© 2021 开心学习