#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Python + pyspider crawler for a novel site: stores results in a database,
# publishes through the LocoySpider ("huochetou") interface, downloads
# resources to local disk, and can serve as a template for other crawlers.
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes chapters.

    Flow: ``on_start`` queues the category listing pages, ``list_Caterg``
    follows each book link, ``list_Caterg_detail`` scrapes the book metadata
    and cover, ``index_page`` opens the first chapter, and ``detail_page``
    stores/publishes a chapter and then follows the "next chapter" link.

    The code is written for Python 2 (the file imports ``urllib2``); only
    single-argument ``print(...)`` calls are used so the syntax also parses
    on Python 3.
    """

    # ``P_dir`` and ``Datos`` are deliberately kept as module-level globals
    # (declared from inside the class body) so that their visibility is the
    # same as before; ``list_Caterg_detail`` reads ``P_dir`` as a global.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Connection settings shared by every insert helper (placeholders: fill
    # in the real user name, password and database name).
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name sent to the publishing back end.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the ``save`` dict handed from one callback to the next.
    SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied to chapter text, in order.
    CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied to the book introduction.
    INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of these characters are deleted from the content/introduction.
    # NOTE(review): this is a byte-string pattern on Python 2 while the
    # scraped text is unicode, so the non-ASCII half presumably does not
    # match as intended there -- confirm before changing it.
    STRIP_PATTERN = re.compile(
        r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    def _insert_row(self, table, columns, values):
        """Insert one row into ``table`` and return its ``lastrowid``.

        ``table`` and ``columns`` are constants supplied by the callers in
        this class; ``values`` are passed to the driver as query parameters
        so that quotes or backslashes in scraped text can neither break nor
        alter the statement. The connection and cursor are always closed.

        On any error while executing or committing, the error is printed,
        the transaction is rolled back and ``None`` is returned. A failure
        to connect propagates to the caller.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table, ', '.join(columns), ', '.join(['%s'] * len(columns)))
        with closing(pymysql.connect(**self.db_config)) as db:
            try:
                with closing(db.cursor()) as cursor:
                    cursor.execute(sql, tuple(values))
                    row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                # Best effort: a failed insert must not abort the crawl task.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Record a book cover (remote URL and local path) in ``BookFile``."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Record a chapter title in ``BookTitle``."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Store a full chapter row in ``BookConte`` and print the new row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the LocoySpider publishing endpoint.

        Text fields are encoded as GBK (unencodable characters are dropped);
        ``Book_img`` is sent unchanged. The response body is printed and the
        ``requests`` response object is returned.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: make GBK the process-wide default encoding, as
            # the original script did. NOTE(review): this affects the whole
            # process; presumably it is only needed for the prints below.
            reload(sys)  # noqa: F821 -- builtin on Python 2
            sys.setdefaultencoding("gbk")

        def to_gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': to_gbk(Bookname),
            'caid': to_gbk(Cater_Name),
            'title_669977_net': to_gbk(Booktitle),
            'article': to_gbk(BookConte),
            'author': to_gbk(Book_author),
            'ready_1': to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': to_gbk(Book_Introduction),
            'abover': to_gbk(abover),
        }
        # Bound the wait so a stalled endpoint cannot hang the worker.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1
        self.total_num = 200

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page (``page_num``..``total_num``) of every category.

        The page counter is a local loop variable: advancing ``self.page_num``
        itself would leave nothing to crawl on the next scheduled run.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Unknown slugs fall back to the slug itself.
                cater_name = self.CATEGORY_NAMES.get(cater_id, cater_id)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book's metadata and cover, then queue its chapter index.

        For each cover image found, the image is downloaded below ``P_dir``
        and a ``BookFile`` row is inserted. The metadata is passed on to
        ``index_page`` through ``save``; ``img`` is an empty string when the
        page has no cover image.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token of the title line, up to the first '|'.
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id from the URL
        Book_Dates = str(datetime.datetime.now())

        img = ''
        for node in response.doc('.bigpic > img[src^="http"]').items():
            img = node.attr.src
            print(img)
            # Cover download: stored as <P_dir><name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Optional database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        carried = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=carried)

    def _carry_over(self, save):
        """Return a fresh dict holding only the ``SAVE_KEYS`` entries of ``save``."""
        return dict((key, save[key]) for key in self.SAVE_KEYS)

    def _scrub(self, text, replacements):
        """Apply the literal ``replacements`` in order, then drop ``STRIP_PATTERN`` runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return self.STRIP_PATTERN.sub(u'', text)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of a book; later chapters are chained from ``detail_page``."""
        carried = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=carried)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, store and publish it, then queue the next chapter.

        Returns a dict with the chapter's fields as the task result.
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Chapter title + latest update + word count + introduction.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._scrub(raw_content, self.CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(saved['Book_Introduction'], self.INTRO_REPLACEMENTS)

        # The path segment that follows "<BookID>/" in the chapter URL.
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: a trailing comma here used to turn it into a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # Optional database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
                          BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional LocoySpider publishing hook (HTTP POST).
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        carried = self._carry_over(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=carried)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns ``True`` when the image was saved and ``False`` when the
        request failed or did not answer with HTTP 200 (in which case no
        file is created). ``P_dir`` is accepted for compatibility and unused.
        """
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        except requests.RequestException as err:
            print("Error %s for download image: %s" % (err, Book_img))
            return False
        if reply.status_code != 200:
            print("Error %s for download image: %s" % (reply.status_code, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        # Open only after a successful fetch so failures leave no empty file.
        with open(imgDir + "/" + file_name, 'wb+') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback variant: save ``response.content`` using the names in ``response.save``."""
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # Join with a separator, consistent with ``download``.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if it is missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
# (Forum "copy code" button text and paste residue removed here.)