Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# The star import stays first so that the explicit imports below keep
# precedence over any names it brings in (same order as the original).
from pyspider.libs.base_handler import *

import codecs
import datetime
import json
import os
import random
import re
import sys
from contextlib import closing

import HTMLParser
import urllib2

import pymysql
import requests

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes what it finds.

    Flow: on_start -> list_Caterg (category listing) -> list_Caterg_detail
    (book page, cover download, BookFile row) -> index_page (chapter list)
    -> detail_page (chapter text; BookConte/BookTitle rows, HTTP publish,
    then follows the "next chapter" link).
    """

    # Local root directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Keyword arguments for pymysql.connect(). The user/password/db values are
    # placeholders and must be replaced with real settings before running.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name passed on to later callbacks.
    category_names = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through ``save``.
    _meta_keys = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Characters stripped from chapter text and introductions (kept exactly
    # as in the original script).
    # NOTE(review): on Python 2 this is a byte-string pattern, so the
    # non-ASCII alternatives may not match unicode text as intended - confirm.
    _strip_pattern = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _insert_row(self, sql, params, verbose=False):
        """Run one parameterized INSERT, commit it and close the connection.

        Values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can neither break nor
        inject into the SQL. Errors while executing are printed and rolled
        back (best effort, as before); a failure to connect still raises.

        :param sql: INSERT statement using ``%s`` placeholders.
        :param params: tuple of values, one per placeholder.
        :param verbose: also print the statement and the new row id.
        """
        with closing(pymysql.connect(**self.db_config)) as db:
            try:
                cursor = db.cursor()
                if verbose:
                    print(sql)
                cursor.execute(sql, params)
                if verbose:
                    print(cursor.lastrowid)
                db.commit()
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, timestamp)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, timestamp)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the statement and new row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
                  Book_Date)
        self._insert_row(sql, params, verbose=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the HTTP publishing endpoint and return the response.

        Text fields are sent GBK-encoded; characters that cannot be encoded
        are dropped. The URL and the account values are placeholders.
        """
        # NOTE(review): Python 2-only hack that changes the interpreter-wide
        # default encoding on every call. Kept unchanged because byte-string
        # arguments would otherwise be decoded differently before .encode();
        # confirm whether it is still needed.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the callback forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category listing page from ``page_num`` to ``total_num``.

        The loop uses local variables only: the original advanced
        ``self.page_num`` permanently, so on a reused handler instance every
        scheduled run after the first crawled nothing.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                Cater_Name = self.category_names.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page: metadata, cover image, then queue its chapter list.

        A page without a matching cover image no longer raises; ``img`` and
        the local path are stored as empty strings instead.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token of the title line, up to the first '|'.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into <P_dir>/<name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = self.P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: remove these two lines to skip the local download.
            if self.download(self.P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From a chapter list, queue the first chapter with the book metadata."""
        Datos = {key: response.save[key] for key in self._meta_keys}
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _replace_all(self, text, pairs):
        """Apply each ``(old, new)`` literal replacement to ``text``, in order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, and follow the next-chapter link.

        Returns a dict describing the chapter. ``Book_img`` is now the cover
        URL string; a stray trailing comma used to make it a 1-tuple.
        """
        save = response.save
        # NOTE(review): this first literal looks like a regex but is used
        # with str.replace, so it only matches that exact text - confirm.
        leading_newlines = r'^\\n\\n'
        content_pairs = [
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (u'小说网', u'文学网'),
            (u'fgdfgf', u'aghgf.com'),
            (u'fgfgf', u'aghgf.com'),
            (leading_newlines, r'<BR>'),
        ]
        intro_pairs = [
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (leading_newlines, u'aghgfh.com'),
        ]

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter text
        # Chapter title + latest update + word count + raw introduction.
        abover = (response.doc('.article-title').text() + save['Book_Synopsis'] +
                  save['Book_Palabras'] + save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = re.sub(self._strip_pattern, u'',
                           self._replace_all(raw_content, content_pairs))
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = re.sub(
            self._strip_pattern, u'',
            self._replace_all(save['Book_Introduction'], intro_pairs))
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        Book_img = save['img']  # cover image URL

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: HTTP publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = {key: save[key] for key in self._meta_keys}
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns True when the file was written and False when the server
        answered with an error status. The request is made before the file
        is opened, so a failed request leaves no empty file behind.
        ``P_dir`` is accepted for compatibility and not used.
        """
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that stores a fetched image using ``file_name``/``imgDir`` from ``save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # join() adds the separator only when imgDir lacks a trailing slash.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if it is missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def _url_basename(self, url):
        """Return the last path segment of ``url`` without query string or fragment."""
        return url.split('?', 1)[0].split('#', 1)[0].split('/')[-1]

    def getExtension(self, url):
        """Return the file extension of ``url`` (no dot), or '' when it has none."""
        basename = self._url_basename(url)
        if '.' not in basename:
            return ''
        return basename.rsplit('.', 1)[-1]

    def getname(self, url):
        """Return the file name of ``url`` up to its first dot."""
        return self._url_basename(url).split(".")[0]
复制代码 ( e: x+ D$ G5 K) Y" [/ O7 n( V
. s% K+ d6 L( \' o5 A |