# Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *
-
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes what it finds.

    Crawl flow::

        on_start -> list_Caterg -> list_Caterg_detail -> index_page
                 -> detail_page (which keeps following the "next" link)

    Every chapter reached by ``detail_page`` is written to MySQL
    (``BookConte`` / ``BookTitle``) and posted to a Locoy publishing
    endpoint; the book cover is downloaded below ``P_dir`` and recorded in
    ``BookFile``.
    """

    # NOTE: ``P_dir`` and ``Datos`` are deliberately kept as *module-level*
    # globals (the ``global`` statement inside a class body binds them at
    # module scope) so that code reading them from this module keeps working.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Single place for the MySQL connection settings (previously repeated in
    # every add_* method). Replace the placeholders with real credentials.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name stored with each book.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys carried from page to page through pyspider's ``save`` mechanism.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (search, replacement) pairs applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (search, replacement) pairs applied, in order, to the synopsis.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Punctuation stripped from chapter text and synopsis.
    # NOTE(review): on Python 2 this is a byte string applied to unicode
    # text, so the non-ASCII alternatives may not match as intended -
    # confirm and switch to a unicode pattern if so.
    _PUNCTUATION_RE = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # ------------------------------------------------------------------
    # MySQL publishing
    # ------------------------------------------------------------------
    def _run_insert(self, sql, params, verbose=False):
        """Execute one parameterized INSERT and commit it.

        The values are passed separately from the SQL text so the driver
        escapes them; scraped text containing quotes can no longer break or
        inject into the statement.

        Args:
            sql: INSERT statement using ``%s`` placeholders.
            params: Tuple of values for the placeholders.
            verbose: When true, print the statement and the new row id.
        """
        # Connection errors propagate (as before); closing() guarantees the
        # connection is released on every path.
        with closing(pymysql.connect(**self.db_config)) as db:
            try:
                with closing(db.cursor()) as cursor:
                    if verbose:
                        print(sql)
                    cursor.execute(sql, params)
                    if verbose:
                        print(cursor.lastrowid)
                db.commit()
            except Exception as err:
                # Deliberately best-effort: a failed insert is reported and
                # rolled back but must not abort the crawl task.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one cover-image record into the ``BookFile`` table.

        Args:
            Bookname: Book title.
            BookIDs: Book id taken from the book URL.
            img: Remote cover-image URL.
            Locaimg: Local path the cover was saved to.
            Book_Dates: Crawl timestamp as a string.
        """
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table.

        Args:
            Bookname: Book title.
            Booktitle: Chapter title.
            BookID: Book id.
            Titleid: Chapter id.
            Book_Date: Crawl timestamp as a string.
        """
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into the ``BookConte`` table.

        Prints the statement and the id of the inserted row. The arguments
        map one-to-one onto the table columns of the same name.
        """
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        self._run_insert(sql, params, verbose=True)

    # ------------------------------------------------------------------
    # Locoy publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        All text fields are sent GBK-encoded; characters that cannot be
        encoded are dropped.

        Returns:
            The ``requests`` response of the POST.
        """
        if sys.version_info[0] == 2:
            # NOTE(review): process-wide default-encoding hack kept from the
            # original (Python 2 only). Every field below is encoded
            # explicitly, so this is probably removable - confirm first.
            reload(sys)  # noqa: F821 - builtin on Python 2
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        """Set the site URL parts, the category slugs and the page range."""
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every listing page of every category.

        Uses local loop variables so that repeated invocations on the same
        instance always cover the full ``page_num..total_num`` range.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                Cater_Name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book's landing page, fetch its cover, queue its catalog.

        The scraped metadata is passed on to ``index_page`` via ``save``.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        try:
            Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        except IndexError:
            # Unexpected page layout: keep crawling without a word count.
            Book_Palabras = ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        img = ''  # stays empty when the page has no cover image
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the book cover.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: save the cover to local disk.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, save):
        """Return a fresh dict holding the book metadata carried in *save*."""
        return dict((key, save[key]) for key in self._META_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the chapter catalog, queue the first chapter of the book."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, and queue the next chapter.

        Returns:
            dict with the book metadata and the cleaned chapter content.
        """
        Bookname = response.doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Book status field, assembled from title, synopsis, word count, intro.
        abover = (response.doc('.article-title').text()
                  + response.save['Book_Synopsis']
                  + response.save['Book_Palabras']
                  + response.save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Clean the chapter text: literal replacements, punctuation strip,
        # then turn blank lines into HTML line breaks.
        BookConte = response.doc('.article-con').text()
        for old, new in self._CONTENT_REPLACEMENTS:
            BookConte = BookConte.replace(old, new)
        BookConte = re.sub(self._PUNCTUATION_RE, u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        # Clean the synopsis the same way (no line-break conversion).
        Book_Introduction = response.save['Book_Introduction']
        for old, new in self._INTRO_REPLACEMENTS:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = re.sub(self._PUNCTUATION_RE, u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: the original had a stray trailing comma here that
        # silently turned the cover URL into a 1-tuple.
        Book_img = response.save['img']  # cover image URL

        # Optional: store the chapter in MySQL for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: publish the chapter through the Locoy endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(response.save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download *Book_img* and store it as ``imgDir/file_name``.

        Args:
            P_dir: Unused; kept for backward compatibility.
            imgDir: Target directory, created if missing.
            file_name: File name inside *imgDir*.
            Book_img: Image URL.

        Returns:
            True when the image was fetched and written, False when the
            request failed (nothing is written in that case).
        """
        # Fetch first so a failed request does not leave an empty file behind.
        try:
            imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            imag.raise_for_status()
        except requests.RequestException as err:
            print("Error %s for download image: %s" % (err, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(imag.content)
        return True

    def save_imgs(self, response):
        """Crawl callback: write the response body to the path given in ``save``.

        Expects ``response.save`` to hold ``file_name`` and ``imgDir``.
        """
        # os.path.join supplies the separator when imgDir has no trailing one.
        file_path = os.path.join(response.save["imgDir"], response.save["file_name"])
        self.save_img(response.content, response.save["imgDir"], file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* (bytes) to *path*, creating *imgDir* if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def _url_basename(self, url):
        """Return the last path segment of *url* without query or fragment."""
        return url.split("?", 1)[0].split("#", 1)[0].split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of *url* (no dot), or '' if it has none."""
        base = self._url_basename(url)
        return base.rsplit(".", 1)[1] if "." in base else ""

    def getname(self, url):
        """Return the file name of *url* up to its first dot."""
        return self._url_basename(url).split(".")[0]