Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
- " N V. U' n$ G
# Star import is pyspider's convention (BaseHandler, every, config, ...);
# kept first so the explicit imports below take precedence, as before.
from pyspider.libs.base_handler import *

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
-
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes the chapters.

    Callback chain:
      on_start            -> queues category listing pages
      list_Caterg         -> queues every book linked from a listing page
      list_Caterg_detail  -> records the book + cover, queues its catalogue
      index_page          -> queues the first chapter of the catalogue
      detail_page         -> cleans/stores/publishes one chapter and follows
                             the fourth ``.articlebtn`` link (presumably
                             "next chapter" -- confirm against the site)
    """

    # Declared as module-level globals exactly like the original script, so
    # anything that reads ``P_dir`` / ``Datos`` from the module keeps working.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # URL slug of each category -> category name passed on to the publishing
    # targets (values reproduced as-is from the original if-chain).
    _CATER_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys forwarded unchanged from the book page to every chapter task.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # ------------------------------------------------------------------
    # database publishing
    # ------------------------------------------------------------------
    def _insert(self, sql, params, show_rowid=False):
        """Run one parameterized INSERT on a fresh connection.

        The values are passed to the driver separately from the statement so
        quotes inside scraped text can neither break nor inject SQL.  Errors
        are reported and rolled back (same best-effort behaviour as before);
        the connection is always closed.
        """
        db = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            if show_rowid:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book/cover record into the ``BookFile`` table."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into the ``BookConte`` table.

        Prints the new row id on success.
        """
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
                  Book_Date)
        self._insert(sql, params, show_rowid=True)

    # ------------------------------------------------------------------
    # Locoy (火车头) HTTP publishing
    # ------------------------------------------------------------------
    @staticmethod
    def _to_gbk(value):
        """Return *value* encoded as GBK bytes, dropping unencodable characters.

        Byte strings are assumed to be UTF-8 (the source file's encoding) and
        are decoded first, so no process-wide default-encoding change is needed.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'ignore')
        return value.encode('gbk', 'ignore')

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint; return the response.

        Text fields are sent GBK-encoded.  The original
        ``reload(sys); sys.setdefaultencoding("gbk")`` hack was removed: it
        altered the default encoding of the whole pyspider process, while the
        explicit per-field encoding below is all this request needs.
        """
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        gbk = self._to_gbk
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': gbk(Bookname),
            'caid': gbk(Cater_Name),
            'title_669977_net': gbk(Booktitle),
            'article': gbk(BookConte),
            'author': gbk(Book_author),
            'ready_1': gbk(Book_Palabras),
            'thumb': Book_img,
            'content': gbk(Book_Introduction),
            'abover': gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # crawl flow
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to request
        self.total_num = 200  # last listing page to request

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num..total_num`` for every category.

        Uses local loop variables instead of advancing ``self.page_num``:
        the original left the attribute past ``total_num`` after the first
        run, so every later scheduled run on the same instance queued nothing.
        """
        cater_name = []
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATER_NAMES.get(cater_id, cater_name)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Queue every absolute book link found in the listing's ``.pic-list``."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, save its cover, record it and queue its catalogue.

        When no cover image matches, the book is still recorded with empty
        ``img`` / ``Locaimg`` instead of hitting undefined variables.  If
        several images match, each is downloaded and the last one is recorded.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token of the header line, up to the first '|'.
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: stored under P_dir/<name>/<name>.<ext>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: drop these two lines to skip the local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of the catalogue, forwarding the book metadata."""
        Datos = {key: response.save[key] for key in self._SAVE_KEYS}
        # Only the first list item is followed; later chapters are reached
        # from detail_page's own follow-up link.
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @staticmethod
    def _scrub(text, swaps, strip_pattern):
        """Apply the literal ``(old, new)`` swaps in order, then delete regex matches."""
        for old, new in swaps:
            text = text.replace(old, new)
        return re.sub(strip_pattern, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Clean one chapter page, store and publish it, then follow the next link.

        Returns the scraped chapter as a dict (pyspider stores it as the
        task result).
        """
        # Literal substitutions (applied in this order) and the punctuation
        # pattern that is stripped afterwards.
        NewRe1 = u'哈书'
        NewRe2 = u'huhjsd.CC'
        NewRe3 = r'^\\n\\n'
        NewRe5 = u'小说网'
        NewRe6 = u'fgdfgf'
        NewRe7 = u'fgfgf'
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
        ReC1 = u'静思'
        ReC2 = u'aghgf.com'
        ReC3 = u'aghgfh.com'
        ReC5 = u'文学网'
        ReC6 = r'<BR>'

        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Status field: chapter title + latest update + word count + intro.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        content_swaps = (
            (NewRe1, ReC1),
            (NewRe2, ReC2),
            (NewRe5, ReC5),
            (NewRe6, ReC2),
            (NewRe7, ReC2),
            (NewRe3, ReC6),
        )
        BookConte = self._scrub(response.doc('.article-con').text(), content_swaps, NewRe4)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        intro_swaps = (
            (NewRe1, ReC1),
            (NewRe2, ReC2),
            (NewRe3, ReC3),
        )
        Book_Introduction = self._scrub(saved['Book_Introduction'], intro_swaps, NewRe4)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain URL string; the original's trailing comma made this a tuple.
        Book_img = saved['img']

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy HTTP publishing hook.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = {key: saved[key] for key in self._SAVE_KEYS}
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img, timeout=30):
        """Fetch *Book_img* and write it to ``imgDir/file_name``.

        The image is requested before the file is opened so a failed request
        no longer leaves an empty file behind.  ``P_dir`` is unused and kept
        only for call compatibility.

        Returns True when the file was written, False when the HTTP request
        failed (the error is printed).
        """
        try:
            imag = requests.get(Book_img, timeout=timeout)
            imag.raise_for_status()
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Crawl callback: store the response body at the path given in ``save``."""
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator only when imgDir lacks one.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* (bytes) to *path*, creating *imgDir* if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the file extension of *url* (no dot), ignoring ?query/#fragment.

        Returns '' when the last path segment has no extension.
        """
        last_segment = url.split("?")[0].split("#")[0].split("/")[-1]
        if "." not in last_segment:
            return ""
        return last_segment.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first dot."""
        last_segment = url.split("?")[0].split("#")[0].split("/")[-1]
        return last_segment.split(".")[0]
复制代码
) o2 v8 ~- o/ Y0 q2 |- L: B2 b
5 N6 U. l5 f- k8 v |