Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes the chapters.

    Flow: ``on_start`` queues the category listing pages, ``list_Caterg``
    queues every book found on a listing page, ``list_Caterg_detail`` scrapes
    the book metadata and cover, ``index_page`` opens the first chapter and
    ``detail_page`` stores one chapter and queues the following one.

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to an HTTP publishing endpoint (``add_locoy``).
    """

    # ``P_dir`` and ``Datos`` are bound as *module* globals on purpose: the
    # methods below read ``P_dir`` as a plain global name.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory that receives downloaded covers
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Connection settings shared by every insert; the user, password and
    # database values are placeholders that must be filled in.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category label sent to the publishing side.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Literal (search, replacement) pairs applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (search, replacement) pairs applied, in order, to the synopsis.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of punctuation/control characters removed from scraped text.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # Keys of the per-book metadata handed from one callback to the next.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    def _insert(self, sql, params):
        """Execute one parameterized INSERT on a fresh connection.

        Args:
            sql: statement using ``%s`` placeholders (no quoting around them).
            params: sequence of values bound to the placeholders by the driver.

        Returns:
            The id of the inserted row, or ``None`` when the statement failed.
            On failure the error is printed and the transaction rolled back.
        """
        db = pymysql.connect(**self.db_config)
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            # Always release the connection, whether or not the insert worked.
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one cover record (remote URL and local path) into BookFile."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into BookTitle."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into BookConte and print its row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the publishing endpoint, GBK-encoded.

        Every text field is encoded to GBK (unencodable characters are
        dropped); ``Book_img`` is sent unchanged.

        Returns:
            The ``requests`` response object of the POST.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: switch the process-wide default encoding to GBK.
            # NOTE(review): this affects the whole interpreter — confirm it is
            # still needed now that every field is encoded explicitly.
            reload(sys)  # noqa: F821 - builtin on Python 2
            sys.setdefaultencoding("gbk")

        def to_gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name (placeholder)
            'my_p': '密码',  # back-office password (placeholder)
            'subject_669977_net': to_gbk(Bookname),
            'caid': to_gbk(Cater_Name),
            'title_669977_net': to_gbk(Booktitle),
            'article': to_gbk(BookConte),
            'author': to_gbk(Book_author),
            'ready_1': to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': to_gbk(Book_Introduction),
            'abover': to_gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        """Set the URL parts, the category slugs and the page range to crawl."""
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page
        self.total_num = 200  # last listing page (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` of every category.

        The loop uses local variables so that the handler's configured page
        range is left untouched and each scheduled run covers it again.
        """
        Cater_Name = []
        for page_num in range(self.page_num, self.total_num + 1):
            for category_id in self.CaterIds:
                # An unmapped slug keeps whatever label was assigned last.
                Cater_Name = self._CATEGORY_NAMES.get(category_id, Cater_Name)
                print(category_id)
                url = self.base_url1 + str(category_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Queue every absolute link found in ``.pic-list`` of a listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, fetch its cover and queue its catalogue page.

        The category label arrives in ``response.save``; the scraped metadata
        is forwarded to ``index_page`` through ``save``.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The word count is the second space-separated token, up to the first
        # '|'; fall back to an empty string when the layout is different.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        img = ''  # stays empty when the page shows no cover image
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover is stored as <P_dir>/<name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        # Built outside the image loop so it exists even without a cover.
        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @staticmethod
    def _book_meta(save):
        """Return a fresh dict holding only the per-book metadata keys."""
        return {key: save[key] for key in Handler._META_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's catalogue page."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then queue the linked chapter.

        Returns:
            A dict with the chapter text and the book metadata.
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        abover = Booktitle + save['Book_Synopsis'] + save['Book_Palabras'] + save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter body: literal replacements, punctuation strip, then turn
        # blank lines into HTML line breaks.
        BookConte = response.doc('.article-con').text()
        for old, new in self._CONTENT_REPLACEMENTS:
            BookConte = BookConte.replace(old, new)
        BookConte = re.sub(self._STRIP_PATTERN, u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        # Synopsis: literal replacements followed by the punctuation strip.
        Book_Introduction = save['Book_Introduction']
        for old, new in self._INTRO_REPLACEMENTS:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = re.sub(self._STRIP_PATTERN, u'', Book_Introduction)

        # Chapter id: the URL segment that follows "<BookID>/".
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        Book_img = save['img']  # cover URL

        # Store in MySQL.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Publish through the HTTP endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Presumably the fourth button is the "next chapter" link — TODO confirm.
        Datos = self._book_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir + "/" + file_name``.

        The image is fetched before the file is opened so that a failed
        request leaves no empty file behind. ``P_dir`` is accepted for
        interface compatibility and is not used.

        Returns:
            ``True`` when the file was written, ``False`` when the server
            answered with an error status (nothing is written in that case).
        """
        imag = requests.get(Book_img)
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback: write a fetched image using the paths in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # join() supplies the separator only when imgDir lacks a trailing one.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` when missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        extension = url.split(".")[-1]
        return extension

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码 # {5 o2 e1 D: N5 U
/ S, ^2 E) g1 v: M& T' Q+ I |