Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
import codecs
import datetime
import json
import os
import random
import re
import sys

import HTMLParser
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *
-
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes what it finds.

    Callback chain: ``on_start`` queues the category list pages,
    ``list_Caterg`` follows every book link, ``list_Caterg_detail`` scrapes
    the book metadata and cover, ``index_page`` opens the first chapter and
    ``detail_page`` scrapes one chapter, stores it in MySQL, posts it to the
    Locoy endpoint and follows the fourth ``.articlebtn`` link (presumably
    the "next chapter" button).
    """

    # ``global`` binds these as module-level names rather than class
    # attributes; list_Caterg_detail reads P_dir as a global.
    global P_dir, Datos
    P_dir = '/Tools/Debug/'  # local directory the cover images are saved to
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Placeholder MySQL credentials -- fill in before running.
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Keys handed from callback to callback through ``save``.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # Source-site category slug -> category name sent to the target system.
    _CATER_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # NOTE(review): this value is passed to str.replace(), so it is matched
    # literally, not as a regex -- confirm whether "leading blank lines" was
    # the intent. Kept unchanged.
    _LEADING_NEWLINES = r'^\\n\\n'

    # Literal (old, new) substitutions applied in order to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (_LEADING_NEWLINES, r'<BR>'),
    )
    # Literal (old, new) substitutions applied in order to the introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (_LEADING_NEWLINES, u'aghgfh.com'),
    )
    # Runs of these characters are removed after the literal substitutions.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _insert_row(self, table, columns, values):
        """Insert one row using a parameterized statement.

        A fresh connection is opened per call and always closed. Scraped
        values are passed as query parameters so quotes inside them cannot
        break or alter the statement.

        :param table: table name (trusted constant).
        :param columns: sequence of column names (trusted constants).
        :param values: sequence of values, one per column.
        :return: ``cursor.lastrowid`` on success, ``None`` if the insert
            failed (the error is printed and the transaction rolled back).
        """
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (
            table,
            ", ".join(columns),
            ", ".join(["%s"] * len(columns)),
        )
        db = pymysql.connect(**self._DB_SETTINGS)
        try:
            cursor = db.cursor()
            cursor.execute(sql, tuple(values))
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store one book's cover record (remote URL and local path) in ``BookFile``."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store one chapter title record in ``BookTitle``."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one full chapter in ``BookConte`` and print the new row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded; characters GBK cannot represent
        are dropped.

        :return: the ``requests`` response object.
        """
        # HACK: Python 2 only. Switches the interpreter-wide default
        # encoding to GBK; kept because other implicit str/unicode
        # conversions may rely on it.
        if sys.version_info[0] < 3:
            reload(sys)
            sys.setdefaultencoding("gbk")

        def to_gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': to_gbk(Bookname),
            'caid': to_gbk(Cater_Name),
            'title_669977_net': to_gbk(Booktitle),
            'article': to_gbk(BookConte),
            'author': to_gbk(Book_author),
            'ready_1': to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': to_gbk(Book_Introduction),
            'abover': to_gbk(abover),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first list page to crawl
        self.total_num = 200  # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue list pages ``page_num``..``total_num`` of every category.

        The page counter is a local loop variable so that ``self.page_num``
        is left untouched and each scheduled run queues the full range again.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg,
                           save=self._CATER_NAMES.get(cater_id, ''))

    def list_Caterg(self, response):
        """Follow every absolute book link on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, save its cover and queue its chapter index.

        For each cover image found the image is downloaded, a ``BookFile``
        row is written and the catalogue link is queued with the scraped
        metadata attached as ``save``.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated field of the subtitle, up to the first '|'.
        # Fall back to '' instead of raising when the field is missing.
        subtitle_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = subtitle_parts[1].split('|')[0] if len(subtitle_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Cover is stored at <P_dir><name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: drop these two lines to skip the local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            Datos = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
            for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
                self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_over(self, saved):
        """Return a fresh dict with the metadata keys copied from *saved*."""
        return dict((key, saved[key]) for key in self._SAVE_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's chapter index.

        Only the first list item is followed; ``detail_page`` then walks on
        from chapter to chapter.
        """
        Datos = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _clean_text(self, text, replacements):
        """Apply the literal *replacements* in order, then strip punctuation runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._STRIP_PATTERN, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and queue the following chapter.

        :return: dict with the chapter and book fields (pyspider result).
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter text
        # Status field: built from title, latest update, word count and the
        # unprocessed introduction.
        abover = Booktitle + Book_Synopsis + Book_Palabras + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._clean_text(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: a trailing comma here used to turn it into a 1-tuple.
        Book_img = saved['img']  # cover URL

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy HTTP publishing hook.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_over(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download *Book_img* to ``imgDir/file_name``.

        The image is fetched before the file is opened so a failed request
        does not leave an empty file behind.

        :param P_dir: unused; kept for call compatibility.
        :return: ``True`` if the image was written, ``False`` otherwise.
        """
        try:
            imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if imag.status_code != 200:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback variant: write the response body to the path given in ``save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # NOTE(review): no separator is inserted here, so imgDir presumably
        # ends with "/" -- confirm against the caller.
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* to *path*, creating *imgDir* first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in *url*."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码 " M' ?4 F+ O* @$ k( b) s: r
, [0 M2 B' Q8 Z, C |