# Python + pyspider:某小说站的爬虫,支持入数据库、火车头发布、资源下载到本地(另可代写爬虫)。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# The star import stays first so that the explicit imports below keep
# precedence over any same-named symbol it may export.
from pyspider.libs.base_handler import *
import pymysql
import random
import datetime
import urllib2
import HTMLParser
import re
import os
import sys
import codecs
import requests
import json

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow of callbacks:
    ``on_start`` -> ``list_Caterg`` (category listing page)
    -> ``list_Caterg_detail`` (book page, cover download, BookFile row)
    -> ``index_page`` (chapter catalogue)
    -> ``detail_page`` (one chapter; stores it, publishes it and follows
    the "next chapter" link).

    Each chapter is written to MySQL (tables BookConte and BookTitle) and
    posted to a Locoy ("火车头") publishing endpoint.  The connection
    settings below are placeholders and must be filled in before use.
    """

    # Local directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'

    # MySQL connection settings, passed as keyword arguments to pymysql.connect.
    DB_CONFIG = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Locoy publishing endpoint and back-end credentials.
    LOCOY_URL = 'http://www.******.net/locoy/?my=book'
    LOCOY_USER = '用户名'
    LOCOY_PASSWORD = '密码'

    # Timeout (seconds) for the direct `requests` calls made by this handler.
    REQUEST_TIMEOUT = 300

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Source-site category slug -> category name handed to later callbacks.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict forwarded through `save`.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied to chapter text, in order.
    # NOTE: the u'^\\\\n\\\\n' entry is a literal string (str.replace does not
    # interpret it as a regex); it is kept exactly as the original had it.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (u'^\\\\n\\\\n', u'<BR>'),
    )

    # Literal (old, new) substitutions applied to the book introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'^\\\\n\\\\n', u'aghgfh.com'),
    )

    # Characters stripped from chapter text and introduction.  This is a
    # unicode pattern so that the CJK punctuation matches unicode page text
    # on Python 2 as well; the two original alternatives are merged into one
    # character class, which removes exactly the same set of characters.
    _PUNCTUATION_RE = re.compile(
        u'[\f\t\v+.{()}!/_,$%^*"\'?~@#&'
        u'—【】“”。、¥…]+'
    )

    def __init__(self):
        """Set the URL pieces, the category slugs and the page range to crawl."""
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        # Kept for backward compatibility; no longer used as a loop variable.
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl (inclusive)

    # ------------------------------------------------------------------
    # Persistence / publishing
    # ------------------------------------------------------------------
    def _insert(self, table, columns, values):
        """Insert one row with a parameterized statement.

        The values are passed to the driver separately from the SQL text, so
        quotes inside scraped content cannot break or alter the statement.
        On failure the error is printed and the transaction rolled back, as
        before.  The connection is always closed.

        Returns the cursor's ``lastrowid``, or None when the insert failed.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        db = pymysql.connect(**self.DB_CONFIG)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, tuple(values))
                row_id = cursor.lastrowid
            finally:
                cursor.close()
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store one book's cover record (remote URL and local path) in BookFile."""
        self._insert(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store one chapter title record in BookTitle."""
        self._insert(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one full chapter record in BookConte and print the new row id."""
        row_id = self._insert(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded (unencodable characters dropped).
        Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # HACK: inherited from the original script.  This changes the
            # interpreter-wide default encoding; it is only possible on
            # Python 2, so it is skipped elsewhere instead of crashing.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_data = {
            'my_u': self.LOCOY_USER,       # back-end user name
            'my_p': self.LOCOY_PASSWORD,   # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(self.LOCOY_URL, data=locoy_data, timeout=self.REQUEST_TIMEOUT)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page of every category.

        Local loop variables are used so that ``self.page_num`` is not
        consumed: each scheduled run walks the full page range again.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def _parse_word_count(self, text):
        """Return the part between the first space and the next '|' of ``text``.

        Returns '' when ``text`` contains no space, instead of raising
        IndexError.
        """
        parts = text.split(' ')
        if len(parts) < 2:
            return ''
        return parts[1].split('|')[0]

    def list_Caterg_detail(self, response):
        """Read a book page, download its cover, store it and queue its catalogue."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the rest of the callback working when the page has
        # no matching cover image.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover image to <P_dir><name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = self.P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(self.P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Database publishing hook; may be disabled if not needed.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, save):
        """Return a fresh dict holding the book metadata keys copied from ``save``."""
        return {key: save[key] for key in self._META_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed in the catalogue.

        Later chapters are reached through the "next" link handled in
        ``detail_page``.
        """
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _clean_text(self, text, replacements):
        """Apply the literal ``replacements`` in order, then strip punctuation."""
        for old, new in replacements:
            text = text.replace(old, new)
        return self._PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Process one chapter page.

        Cleans the chapter text and introduction, stores the chapter in
        MySQL, posts it to Locoy, queues the next chapter and returns the
        chapter record as the task result.
        """
        meta = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()   # book name
        print(Bookname)
        Cater_Name = meta['Cater_Name']               # category
        Book_author = meta['Book_author']             # author
        Book_Synopsis = meta['Book_Synopsis']         # latest update
        Book_Palabras = meta['Book_Palabras']         # word count
        Bookurl = response.url                        # chapter URL
        Booktitle = response.doc('.article-title').text()    # chapter title
        BookID = response.doc('.readset-r span').text()       # book id
        raw_content = response.doc('.article-con').text()     # chapter text
        # Concatenation of title, synopsis, word count and raw introduction,
        # exactly as the original script built it.
        abover = Booktitle + Book_Synopsis + Book_Palabras + meta['Book_Introduction']
        Book_Date = str(datetime.datetime.now())      # crawl time

        BookConte = self._clean_text(raw_content, self._CONTENT_REPLACEMENTS)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(meta['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain value: the original had a trailing comma here, which turned
        # the cover URL into a 1-tuple.
        Book_img = meta['img']

        # Store the chapter in MySQL; these hooks may be disabled if not needed.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Publish through the Locoy endpoint; may be disabled if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(meta)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def _ensure_dir(self, path):
        """Create ``path`` if missing; tolerate it being created concurrently."""
        if os.path.isdir(path):
            return
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir``/``file_name``.

        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file behind.  ``P_dir`` is accepted for
        interface compatibility and is not used.

        Returns True when the file was written, False when the server
        answered with an error status.
        """
        imag = requests.get(Book_img, timeout=self.REQUEST_TIMEOUT)
        if not imag.ok:
            return False
        self._ensure_dir(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback form of image saving: write ``response.content`` to disk.

        Expects ``response.save`` to provide "file_name" and "imgDir".
        """
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator the original concatenation lacked.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        self._ensure_dir(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def _url_filename(self, url):
        """Return the last path segment of ``url`` without query or fragment."""
        return url.split("?", 1)[0].split("#", 1)[0].split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of ``url`` ('' when the file name has none)."""
        filename = self._url_filename(url)
        if "." not in filename:
            return ""
        return filename.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the file name of ``url`` up to its first dot."""
        return self._url_filename(url).split(".")[0]
复制代码
* b" N/ R( L s; ~* x8 f7 t6 ~9 M * L) A" C! ~; p0 u9 x5 h$ j
|