Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site, stores results and republishes them."""

    # Declared global so the two assignments below create module-level names
    # (the methods read P_dir as a global), not class attributes.
    global Datos, P_dir
    # Local directory under which downloaded cover images are saved.
    P_dir = '/Tools/Debug/'
    Datos = {}

    # Browser-like request headers, applied to every crawl via crawl_config.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }
) w9 k+ o# o0 ]! r+ X% ]3 u - def add_BookFile(self,Bookname, BookIDs, img, Locaimg, Book_Dates):
1 _" s+ I/ r5 ~7 m9 R - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")! r) M3 S/ D7 M) s+ Y; m F: u) o
- try:5 \! O- s# Q* u
- cursor = db.cursor()
" ?; U1 \" a' b0 J o5 b3 n& P. K+ V - #注意此处字符串的占位符要加双引号"%s"
, c6 j) }$ M5 g6 Z. u: s9 { - sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, BookIDs, img, Locaimg, Book_Dates);
' V( B: R0 j8 r5 E - # print(sql)9 P. m$ e& p/ Q [% j
- cursor.execute(sql)8 t6 x( q: u# h, G1 X' ~
- ' W" f1 u8 s7 A* S" S! s) ]
- #qid = cursor.lastrowid3 Y1 |) i. j! y$ f' z" k: W
- #print(qid)
# c8 d% l2 f) f0 w- Y9 E; l8 t -
2 V+ u- a3 A0 n+ ^3 B' }! e- x - db.commit()
- G P, g; `6 Q3 R, @; b - except Exception as err:
$ B4 b0 |3 q, |# s# ?4 d) @ - print("Error %s for execute sql: %s" % (err, sql))" J' t. V7 Q0 q5 o. M
- db.rollback()3 f% o7 v7 C/ ^% G; {' e
- def add_comment(self,Bookname, Booktitle, BookID, Titleid, Book_Date):
8 F: i4 h& _9 p! _5 X P - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
! [5 X. I9 [# F# L - try:# {9 ]7 T' j4 G( m4 N! H( y
- cursor = db.cursor()4 s2 S7 g8 w( z0 v* a, g: D
- #注意此处字符串的占位符要加双引号"%s"' }8 L! {' `6 O+ j
- sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, Booktitle, BookID, Titleid, Book_Date);
3 L8 y) H0 Y) P$ z2 G1 Q# L% C! X - # print(sql)
& ~: o) b# g7 G' o3 o. M4 @ - cursor.execute(sql)" E( P+ F2 \( L9 y& B( t
- 3 L: h' f) i% e- @! i
- #qid = cursor.lastrowid+ c/ e% L* V8 R5 Q" h
- #print(qid)
" ^7 u/ N5 s( { J; G -
% k* x' Z6 ^; I8 F @( E. ]8 } - db.commit()
2 i% i7 b" {2 b; t. z - except Exception as err:( Z6 E( U: g" ?. v
- print("Error %s for execute sql: %s" % (err, sql))
* p6 w* q0 N, U! T- a! [1 v: q5 s - db.rollback()+ N% ?. i, b/ y+ {3 V5 s( w
- def add_question(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date): n( f. \2 m1 i5 Y; \; C, s" ~ j
- db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")+ n. @; Q& n- W. j
- try:
, ]. ?9 {0 v$ B6 ^4 n* L3 Q - cursor = db.cursor()1 C8 f8 s2 h0 E6 G
- #注意此处字符串的占位符要加双引号"%s"
5 S) Z+ Z( h; E: S, y7 D5 T: {* N - sql = ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) VALUES("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date);, G3 T) z# e. ~- w1 w0 @
- print(sql)) q+ \5 w3 K# Y, I
- cursor.execute(sql)
& m1 x" H2 r. c. X - print(cursor.lastrowid)' n4 }& b [' \% s7 H1 H
- db.commit()4 p9 B8 a0 A$ s! k: J# y! M1 k
- except Exception as err:
* R# h' k* @* d8 I5 V* g9 o - # except:
- q! }% d0 ~) ]% g* K, g) M/ E - # print('Failed')
9 A! S2 F& E ~6 w- c1 D - print("Error %s for execute sql: %s" % (err, sql))
2 i0 V4 `& R; m" @" h - db.rollback()' b+ X& g8 | v
- 1 o% B) M u3 A/ d$ x0 v& W
- def add_locoy(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover): ! W' U& R$ G- U* [
- reload(sys)
7 u; }! F1 `" \1 d - sys.setdefaultencoding("gbk")
; H% k A" v1 W$ o& Z/ P2 v - locoy_url = 'http://www.******.net/locoy/?my=book' #697火车头发接口地址
4 o9 _6 \( @; ~" E; o# } - locoy_data = {
5 y4 ?4 F1 ~* B, S - 'my_u':'用户名', #后台用户名0 o6 |. f5 o; `4 { l- G" d
- 'my_p':'密码', #后台密码
: H* ^( M: l, v- ~, Y) \% V - 'subject_669977_net':Bookname.encode('gbk', 'ignore'),* f+ `1 S+ `/ D/ @
- 'caid':Cater_Name.encode('gbk', 'ignore'),; f" H; ^1 A. r" B/ u. F( n; {
- 'title_669977_net':Booktitle.encode('gbk', 'ignore'),
. ?6 A! K+ G" Q2 s - 'article':BookConte.encode('gbk', 'ignore'),7 M0 m' o/ W1 B8 i* p, l
- 'author':Book_author.encode('gbk', 'ignore'),, ^2 ~- I& v' O5 p# b/ P9 I
- 'ready_1':Book_Palabras.encode('gbk', 'ignore'),
" z% I' s4 y W d1 p/ [ - 'thumb':Book_img,- t1 F0 b+ {/ C! z; M
- 'content':Book_Introduction.encode('gbk', 'ignore'),# ~- d) ~5 T! U& k2 A
- 'abover':abover.encode('gbk', 'ignore')
0 P) `6 z) Z; A: h* K2 R - }6 H i3 d1 i) s1 V
- res = requests.post(locoy_url, data=locoy_data)! E9 D2 l) _- z" ?2 F
- print res.text
0 R5 k# O) Y$ @1 A' E4 @! L - print res.content/ ~' M/ h6 B! R3 u0 n" ~
- # print Dsd
; R! {6 O8 x; ^ - return res0 q( R; u+ e+ E
- 0 Z5 i& z1 {( o: X/ T+ H$ {! K
- def __init__(self):
: I5 X0 z; E& s - self.base_url1 = 'https://www.****.cc/'
8 J: X: M' [3 A! ^ F: E2 | - self.base_url2 = '/'
B& o4 _- u4 X8 _ - self.CaterId = []! j$ @- `) t5 \1 v
- self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']$ ]6 p6 T3 T7 }" R; r
- self.page_num = 1! a2 H( r8 Q! q; M4 G& I
- self.total_num = 200
. b% K2 ~+ x0 v$ g/ m9 W -
% k& R) Y) h' `5 p; h' d/ ] - @every(minutes=8 * 60)( p( B$ {0 q1 J5 N+ [
- def on_start(self):1 F2 q3 {1 [: ~
- global Cater_Name, M; o" k. f( E6 m0 V1 c' e
- Cater_Name = []* v2 i5 U( Y6 _" I3 C/ w% U
- while self.page_num <= self.total_num: 8 P4 s' I: `% ~* u% {( N
- for self.CaterId in self.CaterIds:
% r5 C# }% d. @' v' W1 ?+ Q: b - if self.CaterId == 'xuanhuan':5 S) o9 e1 Z5 E* v& h B
- Cater_Name = '玄幻'
$ K5 C% z K6 Z( m - if self.CaterId == 'wuxia':
/ ^# a$ [! r2 ^" S' q" ~ - Cater_Name = '武侠'/ d2 n" |! N: y7 l& E1 B: P
- if self.CaterId == 'lishi':3 h# T$ K, O# v2 f% V1 q
- Cater_Name = '历史' 5 a% X0 T2 |% D. u- M
- if self.CaterId == 'yanqing':: G$ A+ l9 j k3 a( o
- Cater_Name = '都市'
4 R" n W6 _3 R. [7 F; Z - if self.CaterId == 'nvsheng':( j7 o* |) u6 W3 ?6 j
- Cater_Name = '都市'
9 O3 s6 s' g4 L0 Y - if self.CaterId == 'kehuan':
; R3 z/ |4 B4 d) w - Cater_Name = '科幻' 1 [0 l% o9 j4 o5 \9 m2 s
- if self.CaterId == 'kongbu':9 l2 s+ J+ ?7 u& ^: L, e5 L
- Cater_Name = '游戏'
6 D, z' n7 p) M) a7 D5 [1 D - print self.CaterId* Z3 z* O& x% e) C( Y- O O% \- w
- url = self.base_url1 + str(self.CaterId) + self.base_url2 + str(self.page_num) + "/" # w8 t8 P0 N' K8 u" R: N' }0 S
- self.crawl(url, callback=self.list_Caterg,save=Cater_Name)
6 ^9 j, o4 A8 @ n& r - self.page_num += 1 ; h( P7 n3 f* X
-
# ?" W0 F/ P, Y9 o$ \ - def list_Caterg(self, response):* l! g, X9 @( J; O3 E
- Cater_Name = response.save i, ~8 F3 C+ ^, l
- for each in response.doc('.pic-list a[href^="http"]').items():
/ C8 K( _' d- M5 c# P% U - self.crawl(each.attr.href, callback=self.list_Caterg_detail,save=Cater_Name)5 h( M7 |9 T; k) Q1 F1 o
- 5 y7 s( e1 x: j# N; @6 t, R% _
- def list_Caterg_detail(self, response):
; T$ g8 Y% m: U! t" [ - Cater_Name = response.save( r2 c1 Z' c# p
- # print Cater_Name+ q/ {9 u) L8 O: ]- |
- Bookname = response.doc('h1').text()+ V& d' X' s8 r8 p
- print Bookname
( ^7 T4 ~" v+ H& H9 j - Book_author = response.doc('.authorname > a').text()& S1 w4 w3 P# G" a s4 ]- H
- # print Book_author6 U. w# p4 L5 o3 E" b
- Book_Introduction = response.doc('.book-intro > div').text(); a8 T9 y' }: a8 b
- # print Book_Introduction
: F; |3 z* c# l. B3 X - Book_Synopsis = response.doc('b').eq(1).text()! x6 I* k. ?. @3 h
- # print Book_Synopsis
9 M4 k0 h3 A* s$ F - Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
2 A7 B! X/ E- }$ [# J0 w- p" ~: e - # print Book_Palabras
# D8 X1 {; x, }) j, B- `0 e: J1 t - BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0] #小说ID
6 Z& r3 {1 o3 i2 C4 ~: [ - # print BookIDs; f- M8 A- n' [ C' m; `
- Book_Dates = str(datetime.datetime.now())
$ ~ H$ Z1 V6 P' E - for imgs in response.doc('.bigpic > img[src^="http"]').items():) Q( q/ [! g: n. E# S# [
- img = imgs.attr.src
) J& q9 n; _% J8 w - print img3 z' m6 }, f/ |. z) }
- #小说封面下载( d$ K6 S3 y' B @
- extension = self.getExtension(img)
# H; B3 A+ @" L H - name = self.getname(img)4 S6 Z6 z N. {* s. F. }) D8 R
- file_name = name + "." + extension
: \ L9 }" Z4 j3 t9 ~6 h2 a2 F7 c - imgDir = P_dir + name' j5 S7 L2 f" T1 h% M
- Locaimg = imgDir + "/" + file_name
) h1 Q9 E+ g& ~7 H/ W3 n6 R - print Locaimg
6 n: |2 Z- R6 ?" Y - if(self.download(P_dir, imgDir, file_name, img)): #这2行可注译,图片下载到本地
/ `2 m+ M& k( d! ? - print('attachment url is ' + img) #$ _6 o; {- ~9 u3 b' p
- Datos = {
6 W( }% N6 B0 D: t - "Cater_Name":Cater_Name,
3 h% O. D; P" j1 F - "Book_author":Book_author,$ ^; C8 ] w3 i/ V( J* R
- "Book_Introduction":Book_Introduction,0 u4 a+ b, n; O( ?( p9 P
- "Book_Synopsis":Book_Synopsis,0 c- B2 u) \* U: S' a
- "Book_Palabras":Book_Palabras,
0 `3 W( T, W, O1 B; u/ B4 \4 {8 O* S - "img":img,7 K! p* a4 k9 C, r0 c6 L
- }8 O, |) S! @7 n, U0 \
- self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates) #这行可注译,数据库发布接口,方便其他系统的发布
: I$ \5 z+ M1 ^# l( Z; Z/ w! W5 ~. W3 o - for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
5 b8 E; V2 D% H7 A7 Q - self.crawl(each.attr.href, callback=self.index_page,save=Datos)$ {6 j; j3 Y0 S9 c+ X# l0 R9 ~! G
- ; `9 @: Y6 e5 f9 Z( R( h
- @config(age=8 * 60 * 60)
" n5 e& U( r; ]* n# n4 f - def index_page(self, response):
2 ]. g* I4 f/ h8 r - Datos = { Y: W/ G5 y( ]
- "Cater_Name":response.save['Cater_Name'],
; v+ D9 e0 e ^ - "Book_author":response.save['Book_author'],1 W$ Q1 V* Z( @3 Q& h. D
- "Book_Introduction":response.save['Book_Introduction'],
$ V/ U' Z4 g7 K) L" l8 }: ^ - "Book_Synopsis":response.save['Book_Synopsis'],
! Q2 j* E. D& w0 Z% {8 T. B - "Book_Palabras":response.save['Book_Palabras'],! }" x- e& h6 T ~, @2 k
- "img":response.save['img'],/ m3 h# ~' P2 s5 O
- }: A" i2 |9 B" V) h: U( s
- for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():4 W8 W. o' W- I9 E
- # for each in response.doc('.chapter-list a[href^="http"]').items(): ( ^1 Q8 X+ i: N* G. r2 V/ I0 Z
- self.crawl(each.attr.href, callback=self.detail_page,save=Datos)
) t9 @3 H4 M) o - @config(priority=2)
2 B M2 j( A% _8 ` - @catch_status_code_error
r% v+ _9 E; h: q! E$ { - def detail_page(self, response): * L# L1 E% d; E8 K
- NewRe1 = u'哈书'
6 E3 |; ^* G+ f- ~" R4 I - NewRe2 = u'huhjsd.CC'+ i, h a+ ?" \2 w. @/ G* c
- NewRe3 = r'^\\n\\n'
( h5 a) s/ j* W0 ^' r0 d6 G - NewRe5 = u'小说网'
4 N* L$ d2 }: F" i3 y - NewRe6 = u'fgdfgf'
& K; T+ T& J6 M+ ] - NewRe7 = u'fgfgf'* T6 I6 u- R! @' b
- NewRe8 = u'ffhgf'0 d4 S& H' n; n& e& Y* Q' s7 L) e6 c, p
- NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
e- h- g" F) _ - ReC1 = u'静思'9 F. L1 r+ H* u; u" ^: c
- ReC2 = u'aghgf.com'
' ~& A4 I% ?6 `7 b7 I$ N - ReC3 = u'aghgfh.com'
6 {: h: g( Z; m' b* P4 x - ReC4 = u''9 q+ t1 u& b; G9 o) e
- ReC5 = u'文学网'
- v$ ^" a7 L5 z4 f$ d- w6 ]3 q2 s* Z - ReC6 = r'<BR>'
1 L3 V" o4 _, h/ f# O" A - Bookname = response.doc('.readlocation a').eq(2).text() #小说名称
5 I2 o$ E1 }8 J7 p! z. j* ^ - print Bookname
2 d# t- [0 U9 Y9 F - Cater_Name = response.save['Cater_Name'] # 小说分类
1 S4 |5 H# ]# j* Z( y& e - Book_author = response.save['Book_author'] #小说作者8 T9 d+ ]# c1 K# q d# z
- Book_Introduction1 = response.save['Book_Introduction'] #小说简介
# L! `, E. d' O# V2 d7 ~+ T1 b - Book_Synopsis = response.save['Book_Synopsis'] #最近更新; Z, u* g) [* Q3 T
- Book_Palabras = response.save['Book_Palabras'] #小说字数
* k8 m' `5 X" Q: ]8 s6 ~ - Bookurl = response.url #小说网址7 U( [+ u6 J1 M+ \1 D
- Booktitle = response.doc('.article-title').text() #章节名称/ f% x z$ u4 d. U* Q
- BookID = response.doc('.readset-r span').text() #小说ID
/ \$ U! k6 e, i5 j - BookConte1 = response.doc('.article-con').text() #小说章节内容
; `6 t" S$ L2 Z - abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction'] #小说状态(连载还是完成). b2 n- `1 D( D, U; o
- Book_Date = str(datetime.datetime.now()) # 采集时间& Y" h3 o# n7 c4 I
- BookConte2 = BookConte1.replace(NewRe1 , ReC1)
- v: W; h" P, P, ^8 i - BookConte3 = BookConte2.replace(NewRe2 , ReC2)* A5 w3 {2 f, }3 |- g' ~. `
- BookConte5 = BookConte3.replace(NewRe5 , ReC5)
, E2 F0 ]# ~* X) S% x9 [: M7 b& Y - BookConte6 = BookConte5.replace(NewRe6 , ReC2)% q9 ^* S+ ?* N' m6 A
- BookConte7 = BookConte6.replace(NewRe7 , ReC2)9 t4 {5 }5 i) `! s
- BookConte8 = BookConte7.replace(NewRe3 , ReC6) W, z# W! M6 @- k- J2 U+ p
- BookConte4 = re.sub(NewRe4 , ReC4 , BookConte8)
6 v5 T. P: T: d( B- s - BookConte = BookConte4.replace("\n\n","<br>")
8 x+ m' X9 z# f2 k9 j - print BookConte
# H! Y1 B7 F( [# }( u - Book_Introduction2 = Book_Introduction1.replace(NewRe1 , ReC1)
0 P4 Z% X8 P% l6 |! T$ \ - Book_Introduction3 = Book_Introduction2.replace(NewRe2 , ReC2)
# h5 I& e5 Q% i/ S: b Y# g# S - Book_Introduction4 = Book_Introduction3.replace(NewRe3 , ReC3)/ ^4 i6 v. }6 f( H3 X$ j
- Book_Introduction = re.sub(NewRe4 , ReC4 , Book_Introduction4)& M4 j! [$ N# q6 _) q% O
- Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
! o1 B0 @' }! l' v0 y - Book_img = response.save['img'], #小说图片
$ Q U; J$ p9 i - 1 q1 W* q/ v( M3 m+ C( G) d% P
- #insert into MySQL 小说入库: S3 M- y5 ~, X, I
- self.add_question(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布
$ v1 b& o# @4 }; A2 d4 g - self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布
: ^% j9 O2 W7 d' s! I# m - #post提交发布
* f- e6 z' ]' P' p9 E5 m - self.add_locoy(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover) #这行可注译,火车头发布接口,不需要可取消
- d3 v$ B+ y0 k. m8 O2 m - Datos = {
9 W3 C, X5 j) y3 i/ P D - "Cater_Name":response.save['Cater_Name'],
- d |" u Y4 F5 C1 Z9 _ - "Book_author":response.save['Book_author'],' } `, ?1 d9 @7 V" ?" p
- "Book_Introduction":response.save['Book_Introduction'],
3 Y6 d$ h; g1 I - "Book_Synopsis":response.save['Book_Synopsis'],# | b8 ?2 X u# j
- "Book_Palabras":response.save['Book_Palabras'],
: e* s: T0 |( n - "img":response.save['img'],
W: [: W+ |) D$ |1 c - }
: V9 S8 s2 p7 O& k, u& k6 Q - for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():5 d) D: q o) U9 t6 o5 t) d% R% x
- self.crawl(each.attr.href, callback=self.detail_page,save=Datos) ; k8 ?# i9 j9 Q: @- b' r. d
- return {
7 w9 p5 k8 h# B& m5 H- C/ d - "Cater_Name":Cater_Name,
+ r$ c5 b1 C6 O3 @ - "Bookname":Bookname,# R2 ?' Q( B1 C1 n8 a# E
- "Book_author":Book_author,
7 v }& I t! C& y - "Book_Introduction":Book_Introduction,
9 h/ w6 ~/ p4 `* I' ]1 x - "Book_Synopsis":Book_Synopsis,
! C+ H# R% a& M1 m - "Book_Palabras":Book_Palabras,9 X, C2 v9 j; ]% G6 y+ `- p+ S6 Q
- "Book_img":Book_img,
4 [) ^9 `: s7 }( i# I - "Bookurl": response.url,
+ j( z1 E* J% E! I# D - "Booktitle": Booktitle,
5 |1 Q5 f9 X& T& D; R: n - "BookID": BookID,, t8 j8 `8 c: M/ [$ ~' U4 _
- "BookConte": BookConte,
* Y4 r. D8 d9 S - "Titleid": Titleid,
4 ]$ t# T* ?" r. k b: s5 p - "abover":abover,
( C! Q" z& l1 Z/ ` - # "Book_Date" = str(datetime.datetime.now())," C/ { f* r0 z3 G& _
- }
& u7 }& o# J. ~' u - def download(self, P_dir, imgDir, file_name, Book_img):4 z# f, ]7 I8 |" X. j, n J
- if not os.path.exists(imgDir): : S' t" x0 M9 S4 E' t- ^
- os.makedirs(imgDir)
" `' \6 S" f4 o' f2 [ - file = imgDir + "/" + file_name$ N0 L, d6 q$ K; S+ s- J& E e
- # print file* f* z8 [+ U, b9 i3 y' L1 N* m
- f = open(file, 'wb+')1 \+ ]1 g9 U& p2 \4 z6 e# k+ e& h
- imag = requests.get(Book_img) " l0 X* b( {( C t! E/ f! o9 _+ f
- f.write(imag.content)
6 y5 s+ J. a& q3 L( p! ~: \8 u - f.close(); R. }, z, {; ?4 l: H
- #保存图片前+ T( ~5 e$ A/ x; \+ L5 L
- def save_imgs(self,response):
7 u5 V, P5 N Q0 C0 H - content = response.content: c+ l3 E# {' f) a
- file_name = response.save["file_name"]4 Z$ z" \: j2 ]+ N* f' f) N2 x
- imgDir = response.save["imgDir"]
: G6 i; y5 ]9 N1 ~ - file_path = imgDir + file_name
4 {1 w. W0 o# _. s2 \, u3 V; q - self.save_img(content,imgDir,file_path)
6 U6 b4 v N |# k - #保存图片6 S0 R2 B' E* N f5 B* c
- def save_img(self,content,imgDir,path):& A; y4 m3 w5 A2 N6 y- ]
- if not os.path.exists(imgDir): 6 u3 p) ~+ s" q& Y# u! N! O
- os.makedirs(imgDir)+ s9 c1 W# d. c% m0 \5 o
- f = open(path,"wb" )
) N: Z; P n2 ?0 c& W; u. T - f.write(content)
7 G4 a$ |9 y! t/ ^" K - f.close()
9 j" {1 A I4 X; m! E* v - #获取url后缀名
$ y6 W; Q9 I. Y2 s - def getExtension(self,url): : d6 r9 I; M; ~1 b) v5 g% F) V: h
- extension = url.split(".")[-1]0 s( `. Z+ T: f/ J, Z
- return extension 6 Y' z$ [! v$ `2 n( r0 {8 E, }
-
7 d9 T, ?9 Y: ~6 k Z" E - #获取图片名4 W" u$ {/ K# S4 Z8 q
- def getname(self,url):( z0 U% B0 D) i# [- l9 b$ K
- name=url.split("/")[-1].split(".")[0]
9 s5 ?: f1 b% P4 n - return name
复制代码
; E9 ~$ f6 ?: \+ Z: g p . H: S6 @: m* U
|