Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes chapters.

    Crawl flow, as wired through the ``callback`` arguments below:
    ``on_start`` (category listing pages) -> ``list_Caterg`` (book links)
    -> ``list_Caterg_detail`` (book metadata + cover) -> ``index_page``
    (first chapter link) -> ``detail_page`` (chapter text, then follows the
    "next chapter" link to itself).

    Each chapter is written to MySQL (``add_question`` / ``add_comment``)
    and posted to a Locoy publishing endpoint (``add_locoy``).
    """

    # ``global`` inside the class body makes the two assignments below
    # create module-level names; ``list_Caterg_detail`` reads ``P_dir`` as a
    # plain global.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Category slug used in listing URLs -> category name passed downstream.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the ``save`` payload carried from the book page to each chapter.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # NOTE(review): the last pair is used with str.replace, so it only
    # matches the literal characters ^\\n\\n, not a regex -- confirm intent.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Literal (old, new) substitutions applied, in order, to the book intro.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of these characters are deleted from chapter text and intro.
    _PUNCTUATION = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def _execute_insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Values are passed separately from the SQL text so the driver escapes
        them; scraped text containing quotes can no longer break or alter
        the statement. Failures are printed and rolled back (best effort,
        like the original code) and the connection is always closed.

        Returns the cursor's ``lastrowid`` on success, ``None`` on failure.
        """
        db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one cover-image record into the ``BookFile`` table."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s, %s, %s, %s, %s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s, %s, %s, %s, %s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter into ``BookConte`` and print the new row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,'
               'Booktitle,BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        row_id = self._execute_insert(sql, params)
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # Locoy publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are encoded as GBK, dropping characters GBK cannot
        represent. Returns the ``requests`` response object.
        """
        # HACK: Python 2 only, and it changes the default encoding for the
        # whole process. Kept as in the original; review before removing.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl configuration and entry point
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Enqueue every listing page of every category.

        Pages ``self.page_num`` .. ``self.total_num`` are walked with a
        local counter. The original advanced ``self.page_num`` itself, so a
        later scheduled run on the same handler instance had nothing left
        to enqueue.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                self.CaterId = cater_id  # attribute kept up to date as before
                Cater_Name = self._CATEGORY_NAMES.get(cater_id, [])
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, store its cover, and queue its catalogue page."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token, up to the first '|'; empty when the
        # text has no space instead of raising IndexError.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())

        # NOTE(review): the original layout was lost; when no cover matches,
        # the book is still crawled with an empty image URL -- confirm.
        img = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into <P_dir>/<name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_save(self, response):
        """Return a copy of the book metadata carried in ``response.save``."""
        return {key: response.save[key] for key in self._SAVE_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's catalogue page."""
        Datos = self._carry_save(response)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    # ------------------------------------------------------------------
    # Chapter pages
    # ------------------------------------------------------------------
    def _clean_text(self, text, replacements):
        """Apply literal (old, new) replacements, then delete punctuation."""
        for old, new in replacements:
            text = text.replace(old, new)
        pattern = self._PUNCTUATION
        # On Python 2 the literal is a UTF-8 byte string whose full-width
        # characters never match unicode text; decode it first.
        if isinstance(pattern, bytes):
            pattern = pattern.decode('utf-8')
        return re.sub(pattern, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, and queue the next chapter.

        Returns a dict of the scraped fields as the task result.
        """
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Chapter title + latest update + word count + raw introduction.
        # NOTE(review): the original comment calls this the book status
        # (serialising or finished) -- confirm what the back end expects.
        abover = Booktitle + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._clean_text(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(response.save['Book_Introduction'], self._INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which made this a 1-tuple.
        Book_img = response.save['img']  # cover image URL

        # Optional: MySQL publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook; remove the call if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_save(response)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        ``P_dir`` is accepted for backward compatibility and is not used.
        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file behind. Request errors propagate to
        the caller, as before.

        Returns True when the file was written, False when the server
        answered with an error status.
        """
        imag = requests.get(Book_img)
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb+') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback: save a fetched image using the names in ``response.save``."""
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join gives the same path as before when imgDir ends with a
        # separator and inserts the missing one when it does not.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def _url_path(self, url):
        """Return ``url`` without its query string and fragment."""
        return url.split("?", 1)[0].split("#", 1)[0]

    def getExtension(self, url):
        """Return the text after the last '.' of the URL, ignoring ?query and #fragment."""
        return self._url_path(url).split(".")[-1]

    def getname(self, url):
        """Return the last path segment of the URL up to its first '.'."""
        return self._url_path(url).split("/")[-1].split(".")[0]
复制代码