Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

# Local directory where cover images are saved while crawling.
P_dir = '/Tools/Debug/'
# Module-level name kept for backward compatibility.  The handler itself
# passes book metadata between callbacks through the task ``save`` payload.
Datos = {}


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    For every book it stores the cover locally, writes book / chapter rows
    to MySQL and posts each chapter to a Locoy publishing endpoint.

    Callback chain: ``on_start`` -> ``list_Caterg`` ->
    ``list_Caterg_detail`` -> ``index_page`` -> ``detail_page`` (which
    re-schedules itself for the link it finds in the chapter footer).
    """

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Category slug used in the source site's URLs -> category name that is
    # stored and published.  Several slugs deliberately share one name.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the metadata dict handed from callback to callback via ``save``.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied in order to the chapter text.
    # They are plain ``str.replace`` arguments, not regular expressions.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Same idea for the book introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Characters stripped from chapter text and introduction.
    # NOTE(review): this is a byte-string pattern; on Python 2 its non-ASCII
    # half cannot match unicode text, so only the ASCII characters are
    # actually removed.  Left unchanged on purpose - confirm the intent
    # before turning it into a unicode pattern.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # ------------------------------------------------------------------
    # MySQL publishing
    # ------------------------------------------------------------------
    def _connect(self):
        """Open a new MySQL connection (fill in the placeholder credentials)."""
        return pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")

    def _insert_row(self, table, columns, values):
        """Insert one row and return its ``lastrowid`` (``None`` on failure).

        ``table`` and ``columns`` are identifiers written in this file; only
        ``values`` (scraped text) is untrusted, so it is passed to the
        driver as query parameters instead of being formatted into the SQL.
        Connection errors propagate; statement errors are printed and the
        transaction is rolled back.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        with closing(self._connect()) as db:
            try:
                with closing(db.cursor()) as cursor:
                    cursor.execute(sql, tuple(values))
                    row_id = cursor.lastrowid
                db.commit()
            except Exception as err:
                # Best effort: a failed insert must not abort the crawl task.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None
        return row_id

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store one book's cover record (remote URL and local path)."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store one chapter-title record in ``BookTitle``."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one full chapter record in ``BookConte`` and print its row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # Locoy publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter, GBK-encoded, to the Locoy endpoint.

        Returns the ``requests`` response object.
        """
        # NOTE(review): Python 2 only, and it changes the interpreter-wide
        # default encoding.  Kept because removing it could change how byte
        # strings are coerced elsewhere; confirm before dropping.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # Without a timeout a stalled endpoint would block the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category listing page from page_num to total_num."""
        cater_name = []
        # Iterate over a local page counter: advancing ``self.page_num``
        # itself would leave it past ``total_num`` and turn every later
        # scheduled run on the same instance into a no-op.
        for page in range(self.page_num, self.total_num + 1):
            for category in self.CaterIds:
                # An unmapped slug keeps the previously used name.
                cater_name = self.CATEGORY_NAMES.get(category, cater_name)
                print(category)
                url = self.base_url1 + str(category) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _parse_word_count(text):
        """Return the second space-separated field of *text*, cut at ``|``.

        Returns an empty string when *text* has no second field.
        """
        fields = text.split(' ')
        if len(fields) < 2:
            return ''
        return fields[1].split('|')[0]

    def list_Caterg_detail(self, response):
        """Scrape a book page, save its cover and schedule the catalogue page."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a cover image is still processed.
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Download the cover into its own sub-directory of P_dir.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: these two lines may be commented out to skip the
            # local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, convenient for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @classmethod
    def _carry_over(cls, save):
        """Copy the book metadata keys out of a task ``save`` payload."""
        return {key: save[key] for key in cls._SAVE_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Schedule the first chapter link found in the chapter list."""
        Datos = self._carry_over(response.save)
        # Only the first list item is followed here; further chapters are
        # scheduled from detail_page.
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @classmethod
    def _clean_text(cls, text, replacements):
        """Apply the literal *replacements* in order, then strip _STRIP_PATTERN."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(cls._STRIP_PATTERN, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and schedule the follow-up link.

        Returns the scraped chapter as a dict (the task result).
        """
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Status text is assembled from the raw, uncleaned introduction.
        abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        cleaned = self._clean_text(response.doc('.article-con').text(), self._CONTENT_REPLACEMENTS)
        BookConte = cleaned.replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(response.save['Book_Introduction'], self._INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # A plain string: a trailing comma here used to turn it into a tuple.
        Book_img = response.save['img']  # cover image URL

        # Optional: database publishing hooks, convenient for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook; remove the call if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_over(response.save)
        # presumably the "next chapter" button - verify against the site markup
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch *Book_img* and write it to ``imgDir/file_name``.

        Returns ``True`` when the file was written and ``False`` when the
        request failed.  ``P_dir`` is accepted for interface compatibility
        and is not used.
        """
        # Fetch first so a failed request does not leave an empty file.
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            reply.raise_for_status()
        except requests.RequestException as err:
            print("Error %s for download image: %s" % (err, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as handle:
            handle.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback variant: save a fetched image described by ``response.save``."""
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator only when imgDir lacks one.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* to *path*, creating *imgDir* first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as handle:
            handle.write(content)

    def getExtension(self, url):
        """Return the text after the last ``.`` in *url*."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first ``.``."""
        return url.split("/")[-1].split(".")[0]
复制代码