#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Python + pyspider crawler for a novel site: stores results in a database,
# publishes through the Locoy ("火车头") interface, downloads resources
# locally; can be adapted into other crawlers.
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *


# Local directory under which downloaded cover images are saved.
P_dir = '/Tools/Debug/'
# Module-level dict, initially empty.
Datos = {}


class Handler(BaseHandler):
    """pyspider handler for the novel-site crawl (project "XiaoShuo")."""

    # Browser-like request headers, referenced by ``crawl_config`` below.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    # Crawl settings: the headers above and a timeout of 300.
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

- def add_BookFile(self,Bookname, BookIDs, img, Locaimg, Book_Dates):( [" Q2 P% [+ v$ F y
- db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8"): ]4 U( z* u* a* }) k& J9 j2 k
- try:; R- G. i7 X* ?9 Q: k
- cursor = db.cursor()" i: Y6 h+ Y1 |/ N1 D& J7 W, }9 o
- #注意此处字符串的占位符要加双引号"%s"
( \9 V' T- F i - sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, BookIDs, img, Locaimg, Book_Dates);: z. D2 J* H; z8 A$ j
- # print(sql)
5 D# }6 V0 ^% } - cursor.execute(sql)* Z/ \; @8 g& J0 a9 e( G
- 3 f' @% |/ H8 h) o
- #qid = cursor.lastrowid
! d, t/ N: g. {0 `* i9 o - #print(qid)
! b% R, r) e" Y& D0 K - 7 O; e/ c2 ?" p& F* b- p
- db.commit(); G- P, ~2 t* Q0 Z y# c D
- except Exception as err:
; W9 Y/ H9 x* I, } ~) }; D% H% F" ? - print("Error %s for execute sql: %s" % (err, sql))! C5 P9 s4 |6 p) W3 A- m% |
- db.rollback()
- p8 T" D5 d4 F6 a( |! n - def add_comment(self,Bookname, Booktitle, BookID, Titleid, Book_Date):
3 V+ ^3 O$ z) W) K5 t - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
( t# z% [8 J1 \* s; A4 J - try:
. H- z( s% b0 c9 m, {0 {; ]% Z6 _ - cursor = db.cursor()
' i2 u- x9 m0 Z y0 N! D7 P - #注意此处字符串的占位符要加双引号"%s" q9 E# [9 L6 X7 v- q% r
- sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, Booktitle, BookID, Titleid, Book_Date);
: _/ v% N4 `/ |% g4 z2 P6 D9 \1 s* d - # print(sql)4 q/ |, u! _7 }2 v
- cursor.execute(sql)
2 k3 b9 A0 u; H6 k7 M2 O& B6 M - # r" z& w8 B0 j' k9 E; ^; P/ L
- #qid = cursor.lastrowid, D) t$ G+ h1 Y n- m
- #print(qid)
7 l# K6 l+ v( K8 W4 L+ f! | - / m+ A/ \7 L# z* t7 o
- db.commit()
! ?: \7 p$ ]/ Y5 Z6 T7 ?5 k - except Exception as err:# ~: V1 `& y% Q( O7 f
- print("Error %s for execute sql: %s" % (err, sql))% k) n1 M! d/ E( |4 ?# p! d
- db.rollback()
$ C: D, Q( Q8 R- [" G6 s) n/ c - def add_question(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date):
4 t/ p4 G' ]5 R: a: |: i9 F/ n* s - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")8 L: {5 o: k. [3 o
- try:# c, W$ v" l! Z
- cursor = db.cursor()
8 R( I4 w9 E3 t7 f - #注意此处字符串的占位符要加双引号"%s"
6 C5 Y' G/ w% `* B! D - sql = ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) VALUES("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date);& ]0 O0 H5 c3 H. \) Q1 r
- print(sql)3 N+ Z7 n5 {+ ] j% Q* L0 B$ T6 h0 w
- cursor.execute(sql)
/ Z$ i3 ?8 k% C n - print(cursor.lastrowid)
8 m4 @. o: W" c6 T; b# Q5 m - db.commit()
, l* Q* G( X/ w1 W5 r; U5 d - except Exception as err:( |6 _' } ^- B5 P* l
- # except:3 D. H' f- ?8 B$ k {) M& C
- # print('Failed')
& E4 H! F9 n. O6 L$ c - print("Error %s for execute sql: %s" % (err, sql)): H2 ~2 f! M* Q" J, w% i
- db.rollback()
$ h' i! A: w8 _% i6 |6 y) ?( v8 t - & p! F' R% b4 ]
- def add_locoy(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover):
* d# P" x* w+ k) B0 a7 k! [6 K2 g, w - reload(sys)1 x. M2 y. z7 K8 x
- sys.setdefaultencoding("gbk")
8 D7 z2 I' [2 W( r k - locoy_url = 'http://www.******.net/locoy/?my=book' #697火车头发接口地址
. I) ?) I* N$ d4 u2 o' X/ n - locoy_data = {& G/ c+ V* }) X8 ~1 Q! p
- 'my_u':'用户名', #后台用户名
3 @7 n) _; P$ F2 o! X1 s2 Q# K - 'my_p':'密码', #后台密码$ |+ @6 l) @/ \" u' I o
- 'subject_669977_net':Bookname.encode('gbk', 'ignore'),. v# S1 |3 q/ ]6 r% g; s
- 'caid':Cater_Name.encode('gbk', 'ignore'),
" s; K- i8 ^8 X5 g+ ~( s! \ - 'title_669977_net':Booktitle.encode('gbk', 'ignore'),
: B. ` R0 S: B N' X! a( z - 'article':BookConte.encode('gbk', 'ignore'),; u( z0 R- L) U$ ?* V3 N3 S, G, |
- 'author':Book_author.encode('gbk', 'ignore'),
# q! w7 ?$ D/ {5 l* z# t - 'ready_1':Book_Palabras.encode('gbk', 'ignore'),
" h, Z2 q0 X q: e - 'thumb':Book_img,
: g5 I5 M. A2 v+ p+ J - 'content':Book_Introduction.encode('gbk', 'ignore'),4 q' ?1 j3 W- ]# ]3 j; g
- 'abover':abover.encode('gbk', 'ignore') - i8 m% c$ s3 k! S& u2 O5 l2 j
- }; M1 n6 A P2 w9 j4 L% @
- res = requests.post(locoy_url, data=locoy_data)8 Y/ F* j" E. J
- print res.text0 a: f# A- j3 i- B
- print res.content
0 Q" L3 L" r/ ~0 D/ I - # print Dsd- e% t/ U, ?# Z& H d
- return res
/ t4 l$ q; {: @3 M. h# w( T - 7 y3 g! P3 v9 Z7 o+ s% [/ Y, q
- def __init__(self):( E7 O6 ?- }, X- [5 A- v4 ]
- self.base_url1 = 'https://www.****.cc/'( H9 R! v, \9 F. s! g5 z8 o8 |. x
- self.base_url2 = '/'
8 q. Y- O2 _# C' k5 Z - self.CaterId = []
7 r2 J( z8 C% q6 P+ j: l - self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
+ e/ ]8 H5 m: l. T, C/ x) J" f4 ^ - self.page_num = 1
" \$ w6 K+ b2 _, b$ n* A - self.total_num = 200 - B4 v+ p8 @% \* D g
-
8 z4 ~# L+ s5 v/ L, i/ | - @every(minutes=8 * 60)+ u; [' X: w: r( p% i
- def on_start(self):
* \- }8 f/ g0 Z - global Cater_Name* m! d- ~, r& N% j/ X8 Q
- Cater_Name = []" Y$ d2 ^# P. A" V; R5 d/ d! B3 \
- while self.page_num <= self.total_num: ; A0 E9 {' W9 f7 }7 m4 y
- for self.CaterId in self.CaterIds:
, q7 E" u0 T7 r4 X - if self.CaterId == 'xuanhuan':) L' [% ?! t3 r
- Cater_Name = '玄幻'
! ?) h g6 G5 q4 Z; _8 A | - if self.CaterId == 'wuxia':
0 x, ]7 c: k! t* r/ m% h+ I& k - Cater_Name = '武侠'" T4 f+ _9 J* y
- if self.CaterId == 'lishi':
$ M; ?& X; E! t/ @$ ~ - Cater_Name = '历史' + Q, S( N0 f0 n5 \ D( f. A" M- J
- if self.CaterId == 'yanqing':4 h% x: U( }; b4 v7 `9 l. r
- Cater_Name = '都市' 9 R3 x9 J z. U8 S
- if self.CaterId == 'nvsheng':" s- w1 |3 b4 Q3 R
- Cater_Name = '都市' 6 I# U% q+ ?0 }( \
- if self.CaterId == 'kehuan':; N% O0 Z+ }6 ^ G2 I+ C5 ]
- Cater_Name = '科幻' 2 [2 v) d% ^! [/ k
- if self.CaterId == 'kongbu':. Q+ q* a4 q6 D) m3 G
- Cater_Name = '游戏'
0 J$ a; V* a, X$ p+ H% ^ - print self.CaterId7 e; w; b# q% E- r/ K! P
- url = self.base_url1 + str(self.CaterId) + self.base_url2 + str(self.page_num) + "/" 8 Y0 S& W4 X4 A! S# L p0 f. H
- self.crawl(url, callback=self.list_Caterg,save=Cater_Name)( v, d! r- ^ n+ H+ G
- self.page_num += 1
) j; T4 m: [% q( q2 C1 t7 p -
8 Q# V T' p+ q' F3 W6 w - def list_Caterg(self, response):$ f9 \; Q0 }; }
- Cater_Name = response.save7 M; `5 S$ ^ ?2 M1 Q- ~
- for each in response.doc('.pic-list a[href^="http"]').items():
: ^4 |+ K. L1 g - self.crawl(each.attr.href, callback=self.list_Caterg_detail,save=Cater_Name)
2 m y% _9 T/ l% x -
5 X3 K$ ?9 v0 w0 l - def list_Caterg_detail(self, response):( \- w6 @( L3 i/ s+ \" G
- Cater_Name = response.save3 o# C Q$ M! U: t- R+ g; e
- # print Cater_Name
2 J c; ?9 l8 r0 l8 f, ~ - Bookname = response.doc('h1').text()
) b6 {& {. R4 [2 }6 n - print Bookname' F9 G5 N9 t9 z g+ Z
- Book_author = response.doc('.authorname > a').text()2 K, T/ M0 p7 [3 ~& U! M0 k
- # print Book_author
7 q: I4 `# X; n0 h. J - Book_Introduction = response.doc('.book-intro > div').text()
3 Z# Z5 H6 p# w - # print Book_Introduction
* z% F; P3 S9 L: O) C5 I - Book_Synopsis = response.doc('b').eq(1).text()5 J) F! T+ G* b' A# M* ^; q3 N, y
- # print Book_Synopsis
' |8 T5 ]5 z% q" m3 p' d - Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
: G1 K, E, x0 b9 s. W4 M$ B - # print Book_Palabras& H' Q$ H3 o0 Q( O
- BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0] #小说ID
0 @1 J0 D. ~: Z - # print BookIDs3 X# T( f4 J; v w/ z5 c( w5 \! @
- Book_Dates = str(datetime.datetime.now()) 8 e1 R- ~2 K6 g( T
- for imgs in response.doc('.bigpic > img[src^="http"]').items():$ H' n" s/ r; z3 H7 {8 D
- img = imgs.attr.src) w) [" ]; H0 ^
- print img
/ N( b! W, ^" c2 l& `$ ?) X - #小说封面下载
5 T5 N4 \2 S9 V; P- W- E, E - extension = self.getExtension(img): z1 u5 X8 E8 V( `
- name = self.getname(img): p2 R2 K. \) v
- file_name = name + "." + extension
" O R3 ]. ^8 V/ @5 U - imgDir = P_dir + name
/ G, |9 y6 r. s1 q5 X( p - Locaimg = imgDir + "/" + file_name
# J( D5 D2 S' y, Z" ? - print Locaimg
% v! y3 F6 X' S2 s9 r& x6 O6 g - if(self.download(P_dir, imgDir, file_name, img)): #这2行可注译,图片下载到本地
2 a4 P ]4 ~; r* \) O - print('attachment url is ' + img) #
+ q+ x0 `! r& x1 @- s* L - Datos = {
) K* c8 k5 L4 f; F - "Cater_Name":Cater_Name,. j) K9 ?7 R4 g
- "Book_author":Book_author," O, M& g3 ?5 J: B: s
- "Book_Introduction":Book_Introduction,3 Z* a. T# {6 r
- "Book_Synopsis":Book_Synopsis,: N' H) }7 _$ Q6 U3 D
- "Book_Palabras":Book_Palabras,/ D/ P/ W2 u! T- v5 d* L
- "img":img,' c# B+ g2 D' l. L, _1 V
- }
0 C H7 T4 i+ d3 ~! Y6 i T s - self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates) #这行可注译,数据库发布接口,方便其他系统的发布
& z) ]) G8 x& i, d7 ` - for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
2 \" K; t6 ]0 _" E* M" \ - self.crawl(each.attr.href, callback=self.index_page,save=Datos)
' W0 x+ ^3 l4 u' ~6 t6 e! e -
) `& P: Z' B$ h0 d! p+ C - @config(age=8 * 60 * 60) & v5 \& q) L0 p- a( Z. H& l
- def index_page(self, response): W8 Z" Z7 p( T! b8 Y; A
- Datos = {
; Z7 M: K2 O9 c* W4 X- Y" H - "Cater_Name":response.save['Cater_Name'],
1 r* j/ K7 V E" K$ W( D2 u - "Book_author":response.save['Book_author'],
6 {4 n6 L9 U$ C6 N, L - "Book_Introduction":response.save['Book_Introduction'],9 }7 o9 H q% q1 e
- "Book_Synopsis":response.save['Book_Synopsis'],
7 ~, d9 Z* f* o1 {' ?) j7 ~0 ^ - "Book_Palabras":response.save['Book_Palabras'],
7 T$ Z: `" z3 f# |( i7 e. m- ~ - "img":response.save['img'],
9 |/ ]9 J- R4 S4 M& J - }9 w- i- x; n. b$ m% I* U' E
- for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
% Z, w S+ |) O8 \; \6 Y - # for each in response.doc('.chapter-list a[href^="http"]').items(): # j- E8 ~( m6 d0 v. I$ v
- self.crawl(each.attr.href, callback=self.detail_page,save=Datos)
- |; \ q1 Z0 Y z- T9 q - @config(priority=2)
( o1 D) K0 x! K# k2 m- V: z - @catch_status_code_error+ h8 y7 M8 X8 Q7 I1 n7 _+ E d- [7 p. A
- def detail_page(self, response): # {$ I4 l3 O! K
- NewRe1 = u'哈书'7 s0 ~6 z; m- F# A" q7 V, k9 |
- NewRe2 = u'huhjsd.CC'2 ]4 q) o! J; }- a
- NewRe3 = r'^\\n\\n'
5 v4 Q! T5 P% W5 C6 q+ M - NewRe5 = u'小说网'
) ^& C' |" ?. ?2 ^' T - NewRe6 = u'fgdfgf'' h0 J1 P v# l1 F# Q
- NewRe7 = u'fgfgf'0 ^- D ?* f$ w; F9 f, [8 I# @
- NewRe8 = u'ffhgf', j8 H' Y( g3 J: ^% l3 d
- NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
- C# y- E* T2 J: @ r- B- v - ReC1 = u'静思'
& y$ D% r/ t7 \8 u/ t - ReC2 = u'aghgf.com'
: m4 ]( v! k4 u# p f M - ReC3 = u'aghgfh.com'. K Q c) _# J6 ]8 W7 C
- ReC4 = u''7 C9 y" x8 d: Q5 X4 N
- ReC5 = u'文学网'
! L* F! a* c' M- g - ReC6 = r'<BR>'
1 ^1 g$ l5 S4 [- p# I: H" L. Z% y - Bookname = response.doc('.readlocation a').eq(2).text() #小说名称
" u# n6 x! }/ C# ^( K+ D' ~4 T - print Bookname; F0 K' w: W1 O
- Cater_Name = response.save['Cater_Name'] # 小说分类% e `* o% Z) h8 m
- Book_author = response.save['Book_author'] #小说作者/ M$ h* Z2 I7 @" X- o
- Book_Introduction1 = response.save['Book_Introduction'] #小说简介- M1 T: z% l v. u4 [& H2 {
- Book_Synopsis = response.save['Book_Synopsis'] #最近更新0 K' W0 k# v( d b d0 Q# D
- Book_Palabras = response.save['Book_Palabras'] #小说字数
b$ y, q* Q+ T3 D; g - Bookurl = response.url #小说网址
$ H& f1 w8 g6 S& r' k+ B - Booktitle = response.doc('.article-title').text() #章节名称
$ H) U$ `& I( [ Y1 u2 s - BookID = response.doc('.readset-r span').text() #小说ID e* Q1 ?- r& H4 j( i4 B5 |& g% B
- BookConte1 = response.doc('.article-con').text() #小说章节内容- I9 A( b$ p3 M# L
- abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction'] #小说状态(连载还是完成)
/ S) P Y+ _* n7 y% w! } - Book_Date = str(datetime.datetime.now()) # 采集时间
0 E5 x5 v! f: @7 B/ j: J - BookConte2 = BookConte1.replace(NewRe1 , ReC1)
- P4 r0 i N4 j$ C3 f* k9 v9 ~ - BookConte3 = BookConte2.replace(NewRe2 , ReC2)
4 w# @& h; ?5 F$ a7 I - BookConte5 = BookConte3.replace(NewRe5 , ReC5)
6 B) ~( Q3 q+ p& h7 l# V2 F; r - BookConte6 = BookConte5.replace(NewRe6 , ReC2)
* O' S# G8 e& v% m - BookConte7 = BookConte6.replace(NewRe7 , ReC2)0 ^/ C& a+ W2 d# \ T. g5 F
- BookConte8 = BookConte7.replace(NewRe3 , ReC6)
3 W: E7 ?2 y- V - BookConte4 = re.sub(NewRe4 , ReC4 , BookConte8)
0 d2 w2 e A0 D+ g - BookConte = BookConte4.replace("\n\n","<br>")
% l' E3 L* \ u, M - print BookConte! B8 f+ M0 i9 ?1 C( v/ p2 e
- Book_Introduction2 = Book_Introduction1.replace(NewRe1 , ReC1)
- Q/ a" i4 D7 a6 @ - Book_Introduction3 = Book_Introduction2.replace(NewRe2 , ReC2)
' m! q- J- O) v% H9 ^5 Y - Book_Introduction4 = Book_Introduction3.replace(NewRe3 , ReC3)
( g" E* i2 l, F$ v. T: @ - Book_Introduction = re.sub(NewRe4 , ReC4 , Book_Introduction4)' w/ c3 {/ \2 D# l+ D
- Titleid = response.url.split(BookID + "/")[-1].split("/")[0] ' Q. G- ~* x8 P4 h2 i
- Book_img = response.save['img'], #小说图片
( a8 B- X+ U3 D9 S3 ` - g" O0 D9 W/ Y
- #insert into MySQL 小说入库
. S- D. v! [$ m- R6 A - self.add_question(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布: c0 ^6 V9 g% _$ s* u% I
- self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布; _! g$ W2 ^% @
- #post提交发布, u" f# d8 k0 t2 l0 ~
- self.add_locoy(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover) #这行可注译,火车头发布接口,不需要可取消
0 f! o5 n+ N2 C3 e. M- c: c5 n - Datos = {( ? W( l) v7 w+ z
- "Cater_Name":response.save['Cater_Name'],+ i0 O% d: b) @+ X% j$ C
- "Book_author":response.save['Book_author'],8 I8 Y8 Q! E: q% n4 C
- "Book_Introduction":response.save['Book_Introduction']," z) p ^, W' j+ K# d' h
- "Book_Synopsis":response.save['Book_Synopsis'],- M/ u3 f* ?5 Q( Z5 h% C
- "Book_Palabras":response.save['Book_Palabras'],3 ~6 f/ \7 H8 t D6 e" L% |& d
- "img":response.save['img'],
. W. d* Y/ ~$ N- n - }
& D+ I! q8 t" t5 y - for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
5 F5 [2 {9 R, Q- q8 K - self.crawl(each.attr.href, callback=self.detail_page,save=Datos)
1 t; b9 [4 ^7 s! ^ - return {( g! H4 _5 [/ m' i
- "Cater_Name":Cater_Name,
" L* n2 p- V% b( A$ l, S - "Bookname":Bookname,' ~, U ?# d% k
- "Book_author":Book_author,
; ?/ k8 v" m; C% X: Y: B H) e: B - "Book_Introduction":Book_Introduction,
$ w2 z( c; e; G - "Book_Synopsis":Book_Synopsis,
/ r! b4 e0 s8 Z: L7 A8 @ - "Book_Palabras":Book_Palabras,, I" m9 J: {- d+ S y4 C7 w
- "Book_img":Book_img,* ^1 p9 Q) m) t
- "Bookurl": response.url,2 L. l) o/ _) B( A }
- "Booktitle": Booktitle,' {5 o8 m& g5 c6 Q" L, n, A
- "BookID": BookID,
9 {: @7 p6 V" S# B7 k7 N% k% `- R9 v - "BookConte": BookConte,
$ j/ e; ~8 y5 ^: e8 C( Y( r - "Titleid": Titleid,
+ }% H* C8 ~! i$ b9 Y - "abover":abover,/ ^2 @$ j4 B9 p5 N4 d
- # "Book_Date" = str(datetime.datetime.now())," E8 \5 N/ }7 B! q& f
- }, F- H) E5 P" N9 `+ r; V
- def download(self, P_dir, imgDir, file_name, Book_img):
$ z) S4 U3 h+ J5 i2 P - if not os.path.exists(imgDir): * \7 }* [" W, ^; y
- os.makedirs(imgDir)
1 m0 f1 \$ c* h$ }+ x- E - file = imgDir + "/" + file_name0 r5 |5 P. i# u, R, b3 ?
- # print file
- q( D! [7 i7 E - f = open(file, 'wb+')
$ Q! E) I) k7 s4 r1 `) P% t# o: a. Z - imag = requests.get(Book_img) ( k1 h7 E6 B: `/ {( p: V7 M8 {4 A1 t$ I
- f.write(imag.content)& K( u8 H5 {) H/ c# R
- f.close()$ J# Y$ p( V8 g: l0 X
- #保存图片前6 s/ V) [' [" `1 Z4 K
- def save_imgs(self,response):6 L# x B0 ]- l- |5 N3 w1 x
- content = response.content
2 r5 B3 e) Z1 s; E2 a4 N) s - file_name = response.save["file_name"]
7 S3 m- A/ [4 F0 a2 o - imgDir = response.save["imgDir"]: I1 ?0 e9 i* u" w9 n: \' Y* `8 v
- file_path = imgDir + file_name C: U! A" `+ U; [
- self.save_img(content,imgDir,file_path)
. N, p$ Z* L8 V' } - #保存图片 m3 O5 ?) Q( M% J
- def save_img(self,content,imgDir,path):
. n$ ?& K; I0 I" o& p - if not os.path.exists(imgDir): ' y" t5 ]( {1 a+ o+ ^; m
- os.makedirs(imgDir)# m9 S% A1 R4 ]1 }3 S
- f = open(path,"wb" )
; }5 d' R$ m, G5 p V, L; M" _, n+ r - f.write(content)
6 D2 d2 M( s" L0 \ - f.close()+ O: s6 D+ W- z) c f+ k3 u' _% n: q
- #获取url后缀名* X* h& k" U n# d1 L& l
- def getExtension(self,url): 1 n- v, U' E$ T- v
- extension = url.split(".")[-1]) c. v0 K% }! R: m, b+ C
- return extension ( \( R4 C& z% n4 X" S
-
- i. e5 r5 q* F) M( n - #获取图片名
; G% f' Q/ R1 Y# M4 \) W$ `6 k - def getname(self,url):
! ^9 ~ L4 M4 B6 s& c, k - name=url.split("/")[-1].split(".")[0]8 h0 o( s' w/ L) h! M7 q; i9 i
- return name
# End of script.