#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
"""pyspider crawler for a novel site.

Scraped books are written to MySQL, published through a Locoy (火车头)
HTTP endpoint, and cover images are downloaded to local disk.  The original
poster also offers to write custom crawlers.

NOTE(review): ``urllib2`` and ``HTMLParser`` are Python 2 module names, so
this script targets Python 2 as written.
"""

# Standard library.
import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

# Third party.
import pymysql
import requests
# pyspider handlers rely on this star import for BaseHandler and the
# every / config / catch_status_code_error decorators.
from pyspider.libs.base_handler import *

# Directory under which downloaded cover images are stored.  It stays a
# module-level global because the handler methods look the name up globally.
P_dir = '/Tools/Debug/'
# Module-level dict declared by the original listing.  The methods below only
# use local dicts of the same name, so nothing in this file reads it; it is
# kept so the module namespace is unchanged.
Datos = {}


class Handler(BaseHandler):
    """Crawl category pages -> book pages -> chapter pages of a novel site.

    Flow: ``on_start`` schedules the category listing pages,
    ``list_Caterg`` follows each book link, ``list_Caterg_detail`` reads
    the book metadata and cover, ``index_page`` opens the first chapter and
    ``detail_page`` stores/publishes a chapter and follows the "next" link.

    NOTE(review): this class was reconstructed from a whitespace-mangled
    paste; block nesting was inferred from syntax and should be confirmed
    against a clean copy of the script.
    """

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        # NOTE(review): the pasted text read "3008 e, ..."; 300 is assumed.
        'timeout': 300
    }

    # Keyword arguments for pymysql.connect(); replace the placeholders.
    _db_settings = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name sent to the target system.
    _category_names = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys carried from task to task through ``save``.
    _meta_keys = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # The last pair is passed to str.replace(), so it only matches the
    # literal characters ^\\n\\n, not a regular expression.
    _content_swaps = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the book intro.
    _intro_swaps = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Characters deleted from chapter text and intro after the swaps.
    # NOTE(review): on Python 2 this is a byte-string pattern applied to
    # unicode text -- confirm the non-ASCII half matches as intended.
    _strip_pattern = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # ------------------------------------------------------------------
    # MySQL publishing
    # ------------------------------------------------------------------
    def _insert_row(self, table, columns, values, echo=False):
        """Insert one row and return ``cursor.lastrowid`` (None on failure).

        The statement uses driver-side ``%s`` placeholders so scraped text
        containing quotes cannot break or alter the SQL.  Errors raised
        while executing are printed and rolled back, as in the original;
        the connection is always closed.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        db = pymysql.connect(**self._db_settings)
        try:
            cursor = db.cursor()
            if echo:
                print(sql)
            cursor.execute(sql, values)
            row_id = cursor.lastrowid
            if echo:
                print(row_id)
            db.commit()
            return row_id
        except Exception as err:
            # Best effort: a failed insert must not abort the crawl task.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert a cover record (remote URL and local path) into BookFile."""
        return self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert a chapter-title record into BookTitle."""
        return self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert a full chapter record into BookConte.

        Prints the statement template and the new row id, mirroring the
        debug output of the original method.
        """
        return self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
            echo=True,
        )

    # ------------------------------------------------------------------
    # Locoy (火车头) publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are encoded as GBK (unencodable characters dropped).
        Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: the original switched the process-wide default
            # encoding to GBK on every call.  Kept for compatibility.
            reload(sys)  # noqa: F821
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1    # first listing page to schedule
        self.total_num = 200  # last listing page to schedule

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every listing page of every category.

        Pages ``page_num``..``total_num`` are scheduled for each category.
        The loop uses local counters: the original advanced
        ``self.page_num`` itself, so once one run had finished, any later
        run on the same handler instance scheduled nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                Cater_Name = self._category_names.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read book metadata, fetch the cover and schedule the chapter index.

        NOTE(review): the pasted source lost its indentation, so the extent
        of the cover loop is a reconstruction.  This version uses the first
        matching cover image and still continues (with empty ``img`` /
        ``Locaimg``) when the page has none.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: stored as <P_dir><name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: these two lines download the image to local disk.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            break

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, convenient for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_meta(self, saved):
        """Return a fresh dict holding the book metadata keys of ``saved``."""
        return dict((key, saved[key]) for key in self._meta_keys)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter listed on a book's chapter index.

        Later chapters are reached from ``detail_page`` via the "next" link.
        To schedule every chapter directly, select
        ``.chapter-list a[href^="http"]`` instead.
        """
        Datos = self._carry_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _scrub(self, text, swaps):
        """Apply the literal ``swaps`` in order, then delete ``_strip_pattern`` matches."""
        for old, new in swaps:
            text = text.replace(old, new)
        return re.sub(self._strip_pattern, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then follow the next-chapter link.

        Returns a dict with the chapter and book fields as the task result.
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = saved['Cater_Name']        # category
        Book_author = saved['Book_author']      # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url                  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()    # book id
        # Status text (serializing or completed) is derived downstream from
        # this concatenation of the raw, unscrubbed fields.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._scrub(
            response.doc('.article-con').text(), self._content_swaps
        ).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(saved['Book_Introduction'], self._intro_swaps)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which made this a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # Optional: database publishing hooks, convenient for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_meta(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir``/``file_name``.

        Returns True when the file was written and False when the HTTP
        request failed.  The request is made before the file is opened so a
        failed download leaves no empty file behind.  ``P_dir`` is unused
        and kept only for call compatibility.
        """
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            reply.raise_for_status()
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback variant: write ``response.content`` to the path in ``save``.

        NOTE(review): the path is ``imgDir + file_name`` with no separator,
        so ``imgDir`` presumably ends with "/"; no caller is visible in this
        file to confirm.
        """
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last "." in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first "."."""
        return url.split("/")[-1].split(".")[0]
# End of listing.