#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes the chapters.

    Crawl flow: ``on_start`` queues the category list pages, ``list_Caterg``
    follows each book link, ``list_Caterg_detail`` scrapes the book metadata
    and cover image, ``index_page`` opens the first chapter and
    ``detail_page`` stores every chapter and follows the "next chapter" link.

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to an HTTP publishing endpoint (``add_locoy``); each book cover is
    downloaded below ``P_dir`` and registered with ``add_BookFile``.
    """

    # The original script declared these two names ``global`` inside the class
    # body, which publishes them as module-level variables. That is preserved
    # so any code relying on the module globals keeps working.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Timeout (seconds) for the direct ``requests`` calls made by this class;
    # same value as ``crawl_config['timeout']``.
    REQUEST_TIMEOUT = 300

    # Connection settings shared by every insert (placeholders: fill in).
    DB_CONFIG = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name sent to the publishing side.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through pyspider's ``save`` mechanism.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Ordered literal (find, replace) pairs applied with ``str.replace``.
    # NOTE(review): r'^\\n\\n' is used as a *literal* string, not as a regex;
    # it looks like a regex for leading blank lines was intended. Kept
    # unchanged because the intent cannot be confirmed from this file.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of these characters are removed from chapter text and synopsis.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _execute_insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection and commit it.

        Values are bound by the driver instead of being formatted into the
        SQL text, so quotes in scraped content can neither break nor inject
        into the statement. The connection is always closed.

        Returns the new row id, or ``None`` when the statement failed (the
        error is printed and the transaction rolled back, as before).
        """
        db = pymysql.connect(**self.DB_CONFIG)
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            # Best effort: a failed insert must not abort the crawl task.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Record a book's cover (remote URL and local path) in ``BookFile``."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Record one chapter title in ``BookTitle``."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one chapter with its book metadata in ``BookConte``.

        Prints the id of the inserted row when the insert succeeds.
        """
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
                  Book_Date)
        row_id = self._execute_insert(sql, params)
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded; characters GBK cannot represent
        are dropped. Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: the original forced the process-wide default
            # encoding to GBK before posting. ``reload`` does not exist as a
            # builtin on Python 3, hence the version guard.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=self.REQUEST_TIMEOUT)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first list page to crawl
        self.total_num = 200   # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every list page of every category.

        Uses local loop variables so the handler's state is not consumed:
        the original advanced ``self.page_num`` past ``self.total_num``, so
        a later scheduled run on the same instance queued nothing.

        NOTE(review): the listing lost its indentation; the page counter is
        assumed to advance once per full pass over the categories.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link on a category list page."""
        cater_name = response.save
        for link in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(link.attr.href, callback=self.list_Caterg_detail, save=cater_name)

    def _parse_word_count(self, text):
        """Return the second space-separated field of *text*, cut at ``|``.

        Returns ``''`` when *text* has no second field (the original raised
        ``IndexError`` and failed the whole task).
        """
        fields = text.split(' ')
        if len(fields) < 2:
            return ''
        return fields[1].split('|')[0]

    def _carry_meta(self, save):
        """Copy the per-book metadata keys out of a ``response.save`` dict."""
        return {key: save[key] for key in self._META_KEYS}

    def list_Caterg_detail(self, response):
        """Scrape a book page, fetch its cover and queue its chapter index.

        NOTE(review): the listing lost its indentation; the database insert
        and the chapter-index crawl are assumed to run once per book, after
        the cover loop.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a cover image no longer raises NameError.
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Cover is stored as <P_dir>/<name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: remove these two lines to skip the local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        book_meta = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, useful for feeding other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=book_meta)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter of a book from its chapter index.

        Only the first list entry is queued; ``detail_page`` then walks the
        book through each chapter's "next" link.
        """
        book_meta = self._carry_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

    def _replace_all(self, text, replacements):
        """Apply literal (find, replace) pairs to *text* in order."""
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and queue the next chapter.

        Returns the scraped fields as a dict (the pyspider task result).
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']          # category
        Book_author = save['Book_author']        # author
        Book_Synopsis = save['Book_Synopsis']    # latest update
        Book_Palabras = save['Book_Palabras']    # word count
        Bookurl = response.url                   # chapter URL
        Booktitle = response.doc('.article-title').text()   # chapter title
        BookID = response.doc('.readset-r span').text()      # book id
        raw_content = response.doc('.article-con').text()    # chapter text
        # Stored in the "abover" (status) column: chapter title followed by
        # the latest-update, word-count and synopsis strings.
        abover = (response.doc('.article-title').text() + save['Book_Synopsis']
                  + save['Book_Palabras'] + save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Rebrand the text, strip punctuation runs, then turn blank lines
        # into HTML line breaks.
        BookConte = self._replace_all(raw_content, self._CONTENT_REPLACEMENTS)
        BookConte = re.sub(self._STRIP_PATTERN, u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = self._replace_all(save['Book_Introduction'],
                                              self._INTRO_REPLACEMENTS)
        Book_Introduction = re.sub(self._STRIP_PATTERN, u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which made this a 1-tuple.
        Book_img = save['img']  # cover image URL

        # Optional: database publishing hooks, useful for feeding other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: HTTP publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Hand the unmodified book metadata on to the next chapter.
        book_meta = self._carry_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download *Book_img* to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file behind. Network errors propagate to
        the caller as before.

        Returns ``True`` when the file was written and ``False`` when the
        server answered with an error status. (The original always returned
        ``None``, so the caller's success branch could never run.)
        ``P_dir`` is accepted for interface compatibility and is not used.
        """
        reply = requests.get(Book_img, timeout=self.REQUEST_TIMEOUT)
        if not reply.ok:
            print("Download of %s failed with HTTP status %s" % (Book_img, reply.status_code))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """pyspider callback: write a fetched image to the path given in ``save``.

        Expects ``response.save`` to carry ``file_name`` and ``imgDir``.
        """
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator the original concatenation
        # omitted when ``imgDir`` has no trailing slash.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write the bytes *content* to *path*, creating *imgDir* if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the file extension of *url* without the dot.

        Query string and fragment are ignored; returns ``''`` when the last
        path segment has no dot.
        """
        basename = url.split("?")[0].split("#")[0].split("/")[-1]
        if "." not in basename:
            return ""
        return basename.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first dot.

        Query string and fragment are ignored.
        """
        basename = url.split("?")[0].split("#")[0].split("/")[-1]
        return basename.split(".")[0]