Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
-
. Z1 |1 b$ k6 {, }9 } - from pyspider.libs.base_handler import *
: Z4 D j N, T4 ?7 _$ z/ B7 ` - import pymysql
# `- i e% X2 S7 h- q3 O% w4 H - import random
% W$ ~2 r, h' I0 O7 G. U - import datetime3 I# r" R0 n0 ?, m
- import urllib2,HTMLParser,re% T. z' H- I: _, P
- import os
* y' H% e+ t5 O# L# C! N8 u - import sys5 R& ~( g0 ^* m4 L! Q
- import re
1 H! _8 J( T2 ]" E5 y# r# } - import codecs1 v! |6 T* |1 w9 j$ e% d6 B$ W
- import requests$ a# b! y) H- ?( J. Q- j2 g
- import json/ k0 p, b2 }. S/ k( w/ ?; J# _9 j( ~
- * v" @/ X& B- n" v5 U* B
class Handler(BaseHandler):
    """pyspider handler that scrapes a novel site chapter by chapter.

    Callback chain: ``on_start`` (category list pages) -> ``list_Caterg``
    (book links) -> ``list_Caterg_detail`` (book metadata + cover) ->
    ``index_page`` (first chapter link) -> ``detail_page`` (chapter text,
    which re-queues itself for the next chapter).

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to a Locoy publishing endpoint (``add_locoy``).

    NOTE(review): the pasted source had lost all indentation; statement
    nesting below was reconstructed from syntax and should be confirmed
    against the author's original file.
    """

    # The original declared these names ``global`` inside the class body, which
    # makes them module-level globals; that is preserved for compatibility.
    global P_dir, Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved while crawling
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # MySQL connection settings (placeholders: fill in user, password, database).
    _db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name sent to the publishing target.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict passed between callbacks via ``save``.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Characters stripped from chapter text and introductions.
    # NOTE(review): on Python 2 this literal is a byte string, so the non-ASCII
    # characters in the second class may not match unicode text - confirm.
    _STRIP_RE = re.compile(r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    # Literal (search, replacement) pairs applied in order with str.replace.
    # NOTE(review): r'^\\n\\n' is used with str.replace, so it is a literal
    # string, not a regex; it looks like it was meant to strip leading blank
    # lines - confirm the intent before changing it.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    def _insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Values are bound by the driver instead of being interpolated into the
        SQL text, so quotes in scraped content cannot break or inject into the
        statement. Returns ``cursor.lastrowid`` on success, ``None`` when the
        statement failed (the error is printed and the transaction rolled
        back, as the original code did). The connection is always closed.
        """
        db = pymysql.connect(**self._db_config)
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            db.commit()
            return cursor.lastrowid
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def _apply_replacements(self, text, pairs):
        """Apply each literal (old, new) pair to ``text`` in order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book/cover record into the ``BookFile`` table."""
        # Placeholders must NOT be quoted when the driver binds the values.
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter (metadata + content) into the ``BookConte`` table.

        Prints the statement template and, on success, the new row id.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        print(sql)
        row_id = self._insert(sql, params)
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        Text fields are sent GBK-encoded (unencodable characters are dropped).
        """
        if sys.version_info[0] == 2:
            # Python 2 only: the original switched the process-wide default
            # encoding to GBK. ``reload`` does not exist as a builtin on
            # Python 3, so the hack is skipped there.
            reload(sys)  # noqa: F821
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first list page to crawl
        self.total_num = 200  # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every category list page from ``page_num`` to ``total_num``.

        Local loop variables are used so that ``self.page_num`` is not
        advanced: the original incremented it, which left later scheduled
        runs on the same handler instance with nothing to iterate.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Unmapped slugs fall back to the slug itself rather than
                # silently reusing the previous category's name.
                cater_name = self._CATEGORY_NAMES.get(cater_id, cater_id)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Queue every book link found on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page: metadata, cover download, then queue its catalog."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token, up to the first '|'; empty when the
        # text has no space (the original raised IndexError there).
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # novel id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a matching cover image no longer raises
        # NameError further down.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover image.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: these two lines save the image locally and may be removed.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing interface for use by other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of a book's catalog page.

        Only the first list item is queued; ``detail_page`` then follows the
        next-chapter link itself. To queue every chapter directly, use the
        selector ``.chapter-list a[href^="http"]`` instead.
        """
        Datos = dict((key, response.save[key]) for key in self._SAVE_KEYS)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, store and publish it, then queue the next chapter.

        Returns a dict with the chapter's fields as the task result.
        """
        Bookname = response.doc('.readlocation a').eq(2).text()  # novel name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # novel id
        # Status field: built as chapter title + latest update + word count +
        # raw introduction.
        abover = (response.doc('.article-title').text() + response.save['Book_Synopsis']
                  + response.save['Book_Palabras'] + response.save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter text: literal replacements, strip punctuation, then turn
        # blank lines into <br>.
        BookConte = self._apply_replacements(
            response.doc('.article-con').text(), self._CONTENT_REPLACEMENTS)
        BookConte = self._STRIP_RE.sub(u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        # Introduction: same treatment without the <br> conversion.
        Book_Introduction = self._apply_replacements(
            response.save['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Book_Introduction = self._STRIP_RE.sub(u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which turned the cover URL
        # into a 1-tuple.
        Book_img = response.save['img']  # cover image URL

        # Optional: insert into MySQL (database publishing interface).
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: POST to the Locoy publishing interface.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Carry the unmodified book metadata forward to the next chapter.
        Datos = dict((key, response.save[key]) for key in self._SAVE_KEYS)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns True when the file was written, False when the server answered
        with an error status (nothing is written in that case). ``P_dir`` is
        accepted for interface compatibility and is not used.
        """
        # Fetch before opening the file so a failed request leaves no empty
        # file and no open handle behind.
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            print('download failed (HTTP %s): %s' % (imag.status_code, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback variant: save a fetched image using paths carried in ``save``."""
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator the original concatenation omitted.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the file extension of the last path segment of ``url``.

        Query string and fragment are ignored; returns '' when the last
        segment has no dot.
        """
        last_segment = url.split("/")[-1].split("?")[0].split("#")[0]
        if "." not in last_segment:
            return ""
        return last_segment.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first dot."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码
; [5 K: O% |9 B* _: p, R+ {. ^9 K + j( N) W0 U: I8 a) R0 V
|