Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Crawl flow: on_start -> list_Caterg -> list_Caterg_detail -> index_page
    -> detail_page (which keeps following the "next chapter" link to itself).
    Along the way book covers are downloaded to disk, book/chapter rows are
    inserted into MySQL and every chapter is posted to a Locoy publish URL.
    """

    # P_dir and Datos are published as module-level globals (not class
    # attributes) because the methods below read P_dir as a bare name.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Site category slug -> category name stored/published with each book.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through the ``save`` argument of self.crawl().
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the book intro.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Regex of punctuation/whitespace runs that are deleted from scraped text.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _insert_row(self, sql, params, echo=False):
        """Run one parameterized INSERT on a fresh connection and commit it.

        ``sql`` uses ``%s`` placeholders and ``params`` supplies the values,
        so the driver does the quoting (scraped text may contain quotes).
        Connection errors propagate; errors while executing are printed and
        rolled back, matching the best-effort behaviour of the callers.
        When ``echo`` is true the statement and the new row id are printed.
        """
        db = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        # closing() guarantees the connection is released on every path.
        with closing(db):
            try:
                cursor = db.cursor()
                if echo:
                    print(sql)
                cursor.execute(sql, params)
                if echo:
                    print(cursor.lastrowid)
                db.commit()
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book-cover record into the BookFile table."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the BookTitle table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into the BookConte table.

        Prints the statement and the id of the inserted row.
        """
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
                  Book_Date)
        self._insert_row(sql, params, echo=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter, GBK-encoded, to the Locoy publish endpoint.

        Returns the ``requests`` response object after printing its body.
        """
        # NOTE(review): Python 2 only, and it changes the default encoding
        # for the whole process. Kept because the .encode('gbk') calls below
        # may receive byte strings, which Python 2 first decodes with the
        # default codec -- confirm before removing.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        """Set the site URL pieces, the category slugs and the page range."""
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1
        self.total_num = 200

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category listing page from page_num to total_num."""
        # A local page counter is used so self.page_num is not consumed:
        # advancing the attribute would leave every later scheduled run on
        # the same handler instance with nothing to crawl.
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                self.CaterId = cater_id
                # Unknown slugs fall back to the slug itself.
                cater_name = self._CATEGORY_NAMES.get(cater_id, cater_id)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, save its cover and follow its catalogue link."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())
        # NOTE(review): everything below depends on the cover URL, so it is
        # kept inside the image loop; a page without a cover schedules nothing.
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover is stored as <P_dir>/<name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            meta = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            # Database publishing hook; may be disabled if not needed.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
            for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
                self.crawl(each.attr.href, callback=self.index_page, save=meta)

    def _book_meta(self, saved):
        """Return a fresh dict holding only the metadata keys from ``saved``."""
        return {key: saved[key] for key in self._META_KEYS}

    def _scrub(self, text, replacements):
        """Apply the literal replacements in order, then strip punctuation."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._STRIP_PATTERN, u'', text)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From a book's chapter catalogue, follow the first chapter link."""
        meta = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and follow the next-chapter link.

        Returns a dict with the chapter and book fields as the task result.
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter body
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._scrub(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: a trailing comma here would turn it into a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # Database publishing hooks; may be disabled if not needed.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Locoy publishing hook; may be disabled if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        meta = self._book_meta(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns True when the image was written and False when the server
        answered with an error status. ``P_dir`` is accepted for
        compatibility with existing callers and is not used.
        """
        # Fetch first so a failed request does not leave an empty file behind.
        reply = requests.get(Book_img)
        if not reply.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback that stores a fetched image using names from ``save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # NOTE(review): joined without a separator, so imgDir presumably
        # ends with "/" -- confirm against the caller.
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码 ; m; G0 d8 U( \5 w4 v: A C
# {/ i( O: R' V S: W |