Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler for a novel site.

    The methods of this class walk category listing pages, book pages and
    chapter pages, store what they scrape in MySQL, post chapters to a Locoy
    publishing endpoint and download cover images to local disk.
    """

    # NOTE: `global` inside a class body makes the assignments below bind
    # module-level names rather than class attributes. The methods read
    # ``P_dir`` as a bare global, so these declarations must stay.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where downloaded images are saved
    Datos = {}

    # HTTP headers sent with every crawl request (via crawl_config below).
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }

    # Default options pyspider applies to every self.crawl() call.
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }
3 G9 l$ V/ {; ~. j; f/ D( t O/ o - def add_BookFile(self,Bookname, BookIDs, img, Locaimg, Book_Dates):( {5 O* a+ o5 y$ q* s- y
- db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
. D2 ^, |% H Z5 L5 g3 A# p( | - try:
0 u9 h" {0 \8 d7 o) J+ J1 d - cursor = db.cursor()9 `7 X) I/ f0 i( {7 v5 @* _/ A
- #注意此处字符串的占位符要加双引号"%s"5 X- l# M0 ]/ w* C8 w5 S- ]. t
- sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, BookIDs, img, Locaimg, Book_Dates);5 W5 ^: y8 @, H1 k& O( D' a: q4 C
- # print(sql)
: g; F' }8 U+ V7 B) ~) l% g - cursor.execute(sql)
4 V& W6 j: e. Q M0 P - 5 `4 g7 U7 m3 T0 X6 x( @! a
- #qid = cursor.lastrowid0 l' b6 u2 [5 f6 ~# w& y# g) p
- #print(qid)
5 ]" P* F! o0 E: ?% S! Q - 4 k, [1 P. \( y5 U
- db.commit(), a9 K0 E9 s' T. D' L; X0 l! @7 H
- except Exception as err:5 d+ {. s/ x( b! Y
- print("Error %s for execute sql: %s" % (err, sql))4 B6 F' t$ J3 v* @
- db.rollback()
: a( `$ w9 R2 ` - def add_comment(self,Bookname, Booktitle, BookID, Titleid, Book_Date):
# t# j7 `' a% U, p6 H( Z - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")3 i/ r) \/ B$ V! V9 \5 v. q& i9 Z
- try:
. E* g/ g6 e3 v3 O) U; p9 L* z - cursor = db.cursor()* ~7 m2 g8 K+ d/ J4 ^/ T
- #注意此处字符串的占位符要加双引号"%s"
+ Y8 t9 B8 _( r! Q. k3 b - sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, Booktitle, BookID, Titleid, Book_Date);& b4 f) g0 Z+ F0 } p* F' O, _: x
- # print(sql)
2 r1 o& j& k8 o T3 z' m - cursor.execute(sql)
, j2 [ ?2 x; K+ |. P& ]) m - 8 D) E+ N1 T; W" i) d2 B+ t# Y
- #qid = cursor.lastrowid
; Q4 k2 ^* i: V# k( ^ - #print(qid)
( H, }6 W4 E% E3 A -
; r. w- x5 C) S& s1 O - db.commit()
. x$ y; \1 c7 L% l( U - except Exception as err:0 Q( P* c" b4 Z6 K! Q, n5 l
- print("Error %s for execute sql: %s" % (err, sql))
8 ~/ {4 w7 m! v$ { - db.rollback()+ k% \; ^! [+ c V
- def add_question(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date):* j9 Z" V) g p' L# R+ G
- db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
' R1 V0 C+ r" V) z- J) H' D - try:$ V4 M" l4 y0 U( }0 M
- cursor = db.cursor()
* z8 ^" ]& m3 z9 l - #注意此处字符串的占位符要加双引号"%s"% x+ q0 l9 I# h5 [! i: }
- sql = ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) VALUES("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date);
6 i" y( w& H s+ S4 D+ [; O3 ?% S - print(sql)
3 K( w$ o/ a. x K& [ - cursor.execute(sql)
6 g8 r; a5 T/ l! y1 S, {/ R( K - print(cursor.lastrowid)
5 w2 F) d; j6 e0 V1 W4 `: e7 g; c - db.commit()
4 k! P) D* r3 t0 G( G( y4 G - except Exception as err:
1 q. J! N! b; ~6 o- S5 |8 C - # except:
. b8 G6 M1 ]! Y& b- l/ R1 u - # print('Failed')
6 U# x- g( e0 ^: T" J - print("Error %s for execute sql: %s" % (err, sql))) Q% S, |* O" A
- db.rollback() J' h: q- b1 \) b+ j
- & j& v* V. }3 v( H& S: p8 C& I
- def add_locoy(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover): # T3 n2 Q& W8 t# H$ `& D1 }
- reload(sys)) M0 P: B2 }0 o# L
- sys.setdefaultencoding("gbk")) d/ D2 }/ E/ j, F# e
- locoy_url = 'http://www.******.net/locoy/?my=book' #697火车头发接口地址
9 G3 O2 x% k+ o2 h - locoy_data = {/ T6 ^+ [0 n2 `9 J& E
- 'my_u':'用户名', #后台用户名
0 \1 L/ A1 i& n5 v) Z5 |3 A5 { - 'my_p':'密码', #后台密码
: d/ W) D6 i* k) X - 'subject_669977_net':Bookname.encode('gbk', 'ignore'),
# K# z' @, f% P8 m' H - 'caid':Cater_Name.encode('gbk', 'ignore'),
) n2 _! D% S# y* K1 ] - 'title_669977_net':Booktitle.encode('gbk', 'ignore'),
m& B0 t8 {. r w, z4 I9 ` - 'article':BookConte.encode('gbk', 'ignore'),* E$ X& u# L4 }+ s( N7 ^
- 'author':Book_author.encode('gbk', 'ignore'),0 Y" o, I% t8 K7 p0 v+ f; `
- 'ready_1':Book_Palabras.encode('gbk', 'ignore'),
$ W; k* N: s* y* T" z* L - 'thumb':Book_img,$ s8 n% n! i$ ?; `4 V
- 'content':Book_Introduction.encode('gbk', 'ignore'),
% {0 b( p3 H/ _) A' d% o( n - 'abover':abover.encode('gbk', 'ignore')
) B' @- o9 q9 w0 Z- b - }
9 i+ i9 Z/ m0 k) n: M2 a - res = requests.post(locoy_url, data=locoy_data); c( ` m5 d' u4 u
- print res.text
$ ^* e% Z$ c$ W2 G1 s - print res.content
2 j1 P+ \ h1 E2 Y e* [ - # print Dsd0 X" Z5 L5 S# v) ^9 |
- return res
* ]) ]; D1 Z/ b+ I: [9 f. a - . h- U& [. ^( ^, e6 Q
- def __init__(self):2 R4 l* `0 \! a0 c! b! Q$ x/ T
- self.base_url1 = 'https://www.****.cc/'
2 r8 V0 E6 j0 M W% D - self.base_url2 = '/'
2 p9 D3 N- t1 s+ f - self.CaterId = []. H2 z( M; P) o3 d( y) w9 x
- self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
7 s- u* N( p% H - self.page_num = 1
& X/ G8 [6 \7 b - self.total_num = 200
% u5 }; n# z) Q# h8 d8 n" F+ v - 8 y+ u* ]& { K. `
- @every(minutes=8 * 60)
3 E1 F+ ]2 A0 d. K- J - def on_start(self):
% a/ r$ i- [( O0 V% k0 z* c1 ^ - global Cater_Name
; ]* d" t; v: x/ t. z: m+ c - Cater_Name = []8 S X% Y8 \' V3 |/ M% o
- while self.page_num <= self.total_num:
4 K, k# n& o: i8 T- m - for self.CaterId in self.CaterIds:
0 r& b- K2 O' L6 H( V# Q - if self.CaterId == 'xuanhuan':- f/ v& j1 M0 D7 @/ \" D* C8 R3 c
- Cater_Name = '玄幻'+ x u3 b6 M+ ~6 i) _
- if self.CaterId == 'wuxia':$ \; {. [8 f( O G+ V0 R' b, E8 W
- Cater_Name = '武侠'; N5 W( [' z: |( W& ?8 {% C
- if self.CaterId == 'lishi':
' C& E+ s7 S8 a3 k( F1 n - Cater_Name = '历史'
6 n" F2 `# s5 G! ~6 Y/ A - if self.CaterId == 'yanqing':
, P; J3 ^8 H' w( Z* l - Cater_Name = '都市'
4 k' V* c6 Z X1 T3 b3 q6 I# B% u - if self.CaterId == 'nvsheng':
& c. `4 D P/ h* A1 U5 U - Cater_Name = '都市' & ~ U9 X0 c* @4 @ v- F. D
- if self.CaterId == 'kehuan':
+ I0 [* O1 D/ O - Cater_Name = '科幻' * v- u$ c3 U/ T$ y: g& W' f- V
- if self.CaterId == 'kongbu':, {# Q( V% Y8 \4 l
- Cater_Name = '游戏'
# i; T1 B' Z9 F. r$ G, L* a& T - print self.CaterId. D1 T$ |4 g8 @! y
- url = self.base_url1 + str(self.CaterId) + self.base_url2 + str(self.page_num) + "/"
( ?1 m/ d' d5 i# e' F; @3 F( [/ g - self.crawl(url, callback=self.list_Caterg,save=Cater_Name)% ]3 R. _% n0 l Z
- self.page_num += 1 + _3 R5 h- d- [
-
7 a% l' C P* ~' [6 f. v! | - def list_Caterg(self, response):2 z. O; i# G6 T# B& m8 [
- Cater_Name = response.save
& u7 _% K# a/ B1 Q: c" t - for each in response.doc('.pic-list a[href^="http"]').items():1 _( l% e1 m& j4 [
- self.crawl(each.attr.href, callback=self.list_Caterg_detail,save=Cater_Name)
# I5 S# y1 [% B0 o - ) b# R0 g7 X- T9 K; N- H
- def list_Caterg_detail(self, response):- i, O+ Q' H$ i5 R6 u O# d
- Cater_Name = response.save7 k/ [& w; j6 ~) n* i
- # print Cater_Name1 E% z9 P; R" ~" { |
- Bookname = response.doc('h1').text()3 w* K: f7 y& u3 V! H: R- Q& e
- print Bookname
/ t7 v8 y; ?- ]/ `: z- K - Book_author = response.doc('.authorname > a').text()
3 T7 Q( O6 V$ ]' I - # print Book_author. D. P) @2 }# U' r
- Book_Introduction = response.doc('.book-intro > div').text()
' c& s8 n) F O( p5 E* ]+ | [3 W - # print Book_Introduction
( N }8 [5 j5 H# }4 y9 m8 G) Y" C+ o - Book_Synopsis = response.doc('b').eq(1).text()
n' l% W/ [* _7 u" H - # print Book_Synopsis7 V$ Y5 [# l9 H* x r
- Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]! W- V' n1 I1 ^
- # print Book_Palabras
8 l& }. h7 Z0 j - BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0] #小说ID
, V0 j: N+ e3 J0 @! J7 }- B8 t - # print BookIDs
; f- @" y, ^# P5 ] - Book_Dates = str(datetime.datetime.now()) # U4 Q, U4 z8 q' K
- for imgs in response.doc('.bigpic > img[src^="http"]').items():
/ F0 I( j: c* P+ Q# _$ A Z - img = imgs.attr.src
' [! x0 {& F/ z5 b. k/ h# t - print img
- b4 \4 G1 ]* [; U - #小说封面下载' w/ {7 n: g g9 j R. g$ u. _
- extension = self.getExtension(img): j! s# E, E @$ D" o
- name = self.getname(img)+ F- O: D! a# b! T p, g
- file_name = name + "." + extension
% I! O, D& Y$ f1 C0 {( J: h [ - imgDir = P_dir + name
$ j2 n8 x& N' m% _ } - Locaimg = imgDir + "/" + file_name
) w: g- _- y) |. P( w' C - print Locaimg
% S, U [3 \( ]! \ - if(self.download(P_dir, imgDir, file_name, img)): #这2行可注译,图片下载到本地
/ `0 n* p( V8 z r% y6 R" v - print('attachment url is ' + img) #
y2 `4 d7 c' A. ^ - Datos = {
% f9 o- n1 G2 o6 E/ T5 S& u - "Cater_Name":Cater_Name,9 F: |$ Z8 c4 A" O
- "Book_author":Book_author,
4 C9 C+ o7 ^, ^" ? - "Book_Introduction":Book_Introduction,
. g" G* l5 ` q4 y - "Book_Synopsis":Book_Synopsis,
' f6 q; c2 N- Z* E; M: R0 J - "Book_Palabras":Book_Palabras,; W$ I3 R+ F: d& f `4 K! D
- "img":img,/ P& ?' a1 R1 m& d# a+ C
- }0 W) m: h1 ~$ h. K* O
- self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates) #这行可注译,数据库发布接口,方便其他系统的发布& q2 d1 k3 Y k* i. _
- for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():7 B: O5 { K% `6 V3 f# N
- self.crawl(each.attr.href, callback=self.index_page,save=Datos)
2 A8 |/ Z/ F E6 T7 p! l9 z - ( n7 R g3 \6 a2 [
- @config(age=8 * 60 * 60) - U3 T5 g2 b1 b1 d* V8 ~1 u
- def index_page(self, response):
6 a, u# w9 o, o$ Y - Datos = {
/ I5 B7 a! } [+ x5 Z - "Cater_Name":response.save['Cater_Name'],9 ?; Q! \" W4 k' D: K5 P
- "Book_author":response.save['Book_author'],
: S8 L6 ^! O! p' J5 m4 t9 G - "Book_Introduction":response.save['Book_Introduction'],/ r% s K# W9 ^( P& i
- "Book_Synopsis":response.save['Book_Synopsis'],
+ f8 Q+ w8 {, e, e5 k - "Book_Palabras":response.save['Book_Palabras'],. b6 X R8 O2 K
- "img":response.save['img'],
! E/ b3 R! U3 b. s) g - }
7 Y& @# P' x* ]1 Z2 M" J - for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
9 B: L$ |- k/ F+ J; |4 v - # for each in response.doc('.chapter-list a[href^="http"]').items():
; P) I$ \: ~* L7 {/ {6 U - self.crawl(each.attr.href, callback=self.detail_page,save=Datos)0 z: e* L% Q" A0 \6 ?9 B
- @config(priority=2)
7 W. o7 s8 _9 A# O$ Q/ z; n. H9 D - @catch_status_code_error2 o; P! \) h$ x$ { ?
- def detail_page(self, response): * c2 U6 N7 I% U- Y
- NewRe1 = u'哈书'8 g* j+ E0 S+ y6 L( G+ Z
- NewRe2 = u'huhjsd.CC'
$ U* q; o2 ~5 y$ ?7 ~8 c - NewRe3 = r'^\\n\\n'
5 u- L# W* a7 W) s. ~' X; a+ a - NewRe5 = u'小说网'7 D. L6 M0 m: O7 R1 ~$ U: q- l
- NewRe6 = u'fgdfgf'
/ `. Y% F, I2 L - NewRe7 = u'fgfgf'1 G, A$ r; O; L2 G7 x
- NewRe8 = u'ffhgf'( l: o- \6 j5 x- B. L8 T: Y
- NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
9 r2 j" B6 M# X - ReC1 = u'静思'
! X6 z: v, @' n3 B) X - ReC2 = u'aghgf.com'
* B! f8 O- ~; s - ReC3 = u'aghgfh.com'
3 x( ?5 i6 y( J' A/ i+ R - ReC4 = u''
( ^4 A0 j8 W6 i, P - ReC5 = u'文学网'
1 c9 j& \, V v( B) d$ f8 g* r - ReC6 = r'<BR>'( T3 a$ z' G' N2 m1 I3 r4 e
- Bookname = response.doc('.readlocation a').eq(2).text() #小说名称
5 o) Z4 E3 l$ _: ?. P- x& a - print Bookname g; w" f- q) |* Z6 z1 n: p
- Cater_Name = response.save['Cater_Name'] # 小说分类
% G8 o E9 R9 s; {9 d; I. A - Book_author = response.save['Book_author'] #小说作者
; \" l0 C" a! ]4 H/ Z3 c/ M& B - Book_Introduction1 = response.save['Book_Introduction'] #小说简介
k. d; @% x; v' v4 } - Book_Synopsis = response.save['Book_Synopsis'] #最近更新 s+ e) }8 f6 R/ q8 C/ a2 P
- Book_Palabras = response.save['Book_Palabras'] #小说字数; p! ]$ o) c+ z' B$ ]- @5 D2 c
- Bookurl = response.url #小说网址0 C; f" A( g0 @9 A5 W( @5 I- U) _
- Booktitle = response.doc('.article-title').text() #章节名称# i1 m5 t" |2 r0 m
- BookID = response.doc('.readset-r span').text() #小说ID
: v$ |1 g( c! W; H7 r* ]" v - BookConte1 = response.doc('.article-con').text() #小说章节内容+ f2 p' i# ]. A, ~8 o7 k$ Q& \
- abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction'] #小说状态(连载还是完成)) h' s3 G9 Z7 N, j
- Book_Date = str(datetime.datetime.now()) # 采集时间
. i9 O D0 `1 s, z - BookConte2 = BookConte1.replace(NewRe1 , ReC1)
2 { ^' D6 |$ @# A - BookConte3 = BookConte2.replace(NewRe2 , ReC2)
( m. F$ N$ M$ b6 d) L1 a2 n( x - BookConte5 = BookConte3.replace(NewRe5 , ReC5)
0 C2 g9 ^8 H4 q! X - BookConte6 = BookConte5.replace(NewRe6 , ReC2)! P% {* A2 }7 B) y4 W- b
- BookConte7 = BookConte6.replace(NewRe7 , ReC2)
' ]: O+ {$ q; W: ?' P3 B2 M+ J - BookConte8 = BookConte7.replace(NewRe3 , ReC6)
) P- e; X+ n {! X5 }. S - BookConte4 = re.sub(NewRe4 , ReC4 , BookConte8)! n1 P; U3 R D! Q* F
- BookConte = BookConte4.replace("\n\n","<br>")
: [- A. B! f* q4 c# l - print BookConte1 t$ Y% a0 Q& w! H4 C* U7 u, T
- Book_Introduction2 = Book_Introduction1.replace(NewRe1 , ReC1)
$ h* R$ |7 |. m3 j* w8 ]# k t - Book_Introduction3 = Book_Introduction2.replace(NewRe2 , ReC2)
" m- K; o$ k8 M% G. a - Book_Introduction4 = Book_Introduction3.replace(NewRe3 , ReC3)/ c: J" U4 p" R' A/ j" S( H7 W- x
- Book_Introduction = re.sub(NewRe4 , ReC4 , Book_Introduction4)
; Q; ]" V/ R" T5 D - Titleid = response.url.split(BookID + "/")[-1].split("/")[0] 8 y0 ^# A- E3 |$ K
- Book_img = response.save['img'], #小说图片 Q! G! M: S9 _7 v$ q8 m
- - D& [6 `! B# C7 x$ a
- #insert into MySQL 小说入库9 {& w; @$ R; n# p1 r; c$ O
- self.add_question(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布
# X. I" v& G1 I+ L - self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布- ^' e8 `) p* E
- #post提交发布
5 k: a9 M$ a6 t9 ~ - self.add_locoy(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover) #这行可注译,火车头发布接口,不需要可取消
' g( t: X. }( g L6 P; Z# F( o; Y - Datos = {
c+ o# p6 W0 J. O; J - "Cater_Name":response.save['Cater_Name'],% S1 ^1 k3 `$ k, m1 h# M8 _: B
- "Book_author":response.save['Book_author'],
; |; Z M t% a% G& B' G4 d2 Z- H - "Book_Introduction":response.save['Book_Introduction'],
0 l! C3 a' L' D! z4 t, L5 { - "Book_Synopsis":response.save['Book_Synopsis'],, o* K2 i7 P2 V* y J u
- "Book_Palabras":response.save['Book_Palabras'],9 v* ?" c/ H6 P( K+ k
- "img":response.save['img'],
4 r3 h/ p0 Z3 G# z - }
4 o/ I, n8 S6 i: c% ?3 o( E/ R - for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():4 C6 X, V9 z4 h+ X- @; q5 ^ o
- self.crawl(each.attr.href, callback=self.detail_page,save=Datos)
( M7 g- q7 R$ x) u* W - return {
Z4 N; }# I/ x7 f! e2 C - "Cater_Name":Cater_Name,
' W+ \, K) f. f- q& v - "Bookname":Bookname,
1 \1 O" U/ [. c( k3 v: T - "Book_author":Book_author,. V" t7 r9 i9 I
- "Book_Introduction":Book_Introduction,
' X) ]5 X; s: \. L' F, U! @ - "Book_Synopsis":Book_Synopsis,
+ x. U" W: D# |: i - "Book_Palabras":Book_Palabras,
5 e5 [7 X6 n- t" B3 l5 ~ - "Book_img":Book_img,
) h: ]5 V K0 D7 q3 v# |* W: x - "Bookurl": response.url,
1 N3 k: s; x8 m& I4 U: k+ ? - "Booktitle": Booktitle,
" K) w7 |# j8 K. \6 m" q - "BookID": BookID,
M5 E9 E3 c8 C; p f$ N! q - "BookConte": BookConte,/ B5 ]. T$ m& B! j
- "Titleid": Titleid,
5 b; M3 |' j4 U; P$ a' B - "abover":abover,; Y, K8 ]. B( G
- # "Book_Date" = str(datetime.datetime.now()),0 U+ H# ?5 V% |2 @4 C: O
- }% q4 u' L( F1 N5 C
- def download(self, P_dir, imgDir, file_name, Book_img):
3 n$ j8 R1 g C - if not os.path.exists(imgDir):
' `( t- Y+ @5 r. J |7 ?4 \ - os.makedirs(imgDir)# J/ T% B; m5 m2 R
- file = imgDir + "/" + file_name
' i; Z: E4 t- z5 `/ v3 r - # print file/ j; |; o- a) ?' U! r
- f = open(file, 'wb+')- m3 h. h j& x K& {( Q% T' w2 f
- imag = requests.get(Book_img) ! c: @5 ]' ?# s% s" `
- f.write(imag.content)# e* q' Z! X+ f8 R, [* n
- f.close()& c7 ]) f ]" u6 m I, Z. d
- #保存图片前
: y& a$ E6 P, `* t$ ~2 d2 A1 z - def save_imgs(self,response):/ V$ N8 d) i( t* b4 ^
- content = response.content
n* n0 Z9 G' F. e) K - file_name = response.save["file_name"]5 r4 |" \2 C* Q; t
- imgDir = response.save["imgDir"]' z, D9 S# q/ I) W* i5 T3 }
- file_path = imgDir + file_name
' [" l1 {/ _9 P* B% b+ ` - self.save_img(content,imgDir,file_path)
w0 C$ @) ~' ]; Y$ I* b3 F - #保存图片
) }+ D! H1 i! h8 c - def save_img(self,content,imgDir,path):& s5 U! r' x0 K Z$ o
- if not os.path.exists(imgDir):
0 [ n: w# I1 D2 D - os.makedirs(imgDir)
. O: T/ o' F5 M# H7 ^8 [ - f = open(path,"wb" )6 R4 C/ ^7 i. k' z1 ~/ F; `
- f.write(content)
" q2 d$ B* t* ]0 M3 T) L+ P - f.close()
/ \" J0 G b/ w) ^7 [ - #获取url后缀名
& Z: Y9 C3 H3 L1 B+ m8 f - def getExtension(self,url):
0 Y5 D+ F @. n6 O; X( K - extension = url.split(".")[-1]
2 B2 K) c2 r2 d" K3 w7 o2 ]8 I- _ - return extension . Q. S+ t( G: j7 P4 {" f0 `1 Y
-
) |8 s# U/ Z9 S8 R& q - #获取图片名/ L# k: O' p; u; J7 u
- def getname(self,url):
s) x" w" G: G7 T% i - name=url.split("/")[-1].split(".")[0]# J. e6 ?% y, M- W
- return name
复制代码
, @; s, L# M A3 A, E! B; ~, T & S6 k& \8 }( E7 S4 v. `9 D
|