#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Python + pyspider crawler for a novel site: stores the results in a
# database, publishes them through the Locoy (火车头) interface, downloads
# resources to local disk, and can serve as a template for other crawlers.
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# Kept ahead of the other imports (as in the original) so that a name
# exported by the star import can never shadow the modules imported below.
from pyspider.libs.base_handler import *

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests


class Handler(BaseHandler):
    """pyspider handler for a novel site.

    The methods of this class queue category listing pages, scrape book and
    chapter pages, insert rows into MySQL, POST chapters to a Locoy
    publishing endpoint and save cover images under ``P_dir``.
    """

    # ``global`` in a class body makes these assignments bind *module*
    # globals instead of class attributes.  That is required here because
    # list_Caterg_detail reads the bare name ``P_dir``.
    global P_dir, Datos
    # Local directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'
    Datos = {}

    # Browser-like request headers, handed to the crawler via crawl_config.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    # Project-wide defaults picked up by pyspider for every self.crawl call.
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }
& k1 B! p# I: | - def add_BookFile(self,Bookname, BookIDs, img, Locaimg, Book_Dates):
/ ?5 E3 H" {2 v/ [- L - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8"). E. x6 ~: @( ~, O
- try:
0 c$ ]& @/ H, s/ b6 ] - cursor = db.cursor()
+ P* e, z% y% B0 t" P - #注意此处字符串的占位符要加双引号"%s"
3 }- m: F7 ~3 W0 J0 l& D - sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, BookIDs, img, Locaimg, Book_Dates);" }- Q- g$ t3 |
- # print(sql)4 s( g4 R7 F# d5 y
- cursor.execute(sql)
6 o, O2 H7 |* h: C5 v7 x8 s -
# V" u, E5 d) }: v7 P - #qid = cursor.lastrowid
( q7 q; @3 g3 o: }2 \5 O ` - #print(qid)
L- H8 j0 X; J3 l. L- f5 ?) M - 5 C9 e7 c; p$ I6 o
- db.commit(). N% Q! E, p# M$ V; J9 r! F
- except Exception as err:4 ^' g/ [/ R% \* i' f' H, x
- print("Error %s for execute sql: %s" % (err, sql))/ I- b2 G% W2 k1 w
- db.rollback()# r' m" C* N4 Y
- def add_comment(self,Bookname, Booktitle, BookID, Titleid, Book_Date):% z$ B/ K2 y6 A8 B
- db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")# d8 _7 }! t L% R
- try:4 p2 _# M. H Q9 Y$ k5 }
- cursor = db.cursor()
" X& ?" C" J4 o" h4 e4 J* G9 [ - #注意此处字符串的占位符要加双引号"%s": J: |3 `3 d! K: l2 H
- sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, Booktitle, BookID, Titleid, Book_Date);
?, Q- R: n* Y* h' o - # print(sql)9 @. n- u6 U& f; J; A" B
- cursor.execute(sql)1 u% d+ G/ K I5 c1 a) `
-
; B( z" z2 L$ _7 ^$ s; f - #qid = cursor.lastrowid! V l/ ^# N; h
- #print(qid)/ U6 W, B2 h/ y! [: l
-
! j, x0 X) R. z+ a7 G+ t - db.commit()
' o+ F8 }# Z/ s5 X: G- y - except Exception as err:
3 B9 {. R6 k' E - print("Error %s for execute sql: %s" % (err, sql))- K7 J* ~7 P" ]! ]8 }( P: I' g* @* r/ J
- db.rollback()+ e, j7 P$ o. e+ e" R' f0 J
- def add_question(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date):
. o& h/ }0 ]- _: {. P - db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
u$ o, u- g! N8 W0 y5 A) X6 J& S - try:
: r9 [/ U6 c2 [ o4 l - cursor = db.cursor(), z9 L% l) T7 X: H2 ^$ T
- #注意此处字符串的占位符要加双引号"%s"1 ] F; x6 U* [
- sql = ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) VALUES("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date);
' |& O/ f; v0 c( z7 J - print(sql)
$ ]" ~ V Z7 L% L" J4 y - cursor.execute(sql)* {9 H, W/ G7 h0 q1 |+ L+ S" }
- print(cursor.lastrowid); D' A* J. Z( \9 F& `1 [2 z6 I
- db.commit()
- \. F& U2 e) Y6 j# ^# z6 P: g7 O - except Exception as err:
7 b$ a4 D: K) `) z- Y6 K- J - # except:
1 e% k$ v q* w: T" H$ \ v - # print('Failed')
( ^% H- A5 I* L7 V8 o% m* O - print("Error %s for execute sql: %s" % (err, sql))
0 l1 G, y$ H0 M: d - db.rollback()( d" l& k! X* ]5 l2 F
-
+ Q5 j6 y! _, o/ e# R - def add_locoy(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover):
9 [. |9 J6 o" P5 c" N7 U - reload(sys)8 \5 f2 p/ ^$ @$ x& y
- sys.setdefaultencoding("gbk")
$ \0 N1 U8 z8 q# k, } - locoy_url = 'http://www.******.net/locoy/?my=book' #697火车头发接口地址" F9 m" Y5 Y: \4 K( b4 @! k
- locoy_data = {
( ?$ M' J2 M4 z. c - 'my_u':'用户名', #后台用户名
6 ^+ k; p9 P; E) y& h7 p3 g: Z - 'my_p':'密码', #后台密码% D8 v. ~" _2 v# z
- 'subject_669977_net':Bookname.encode('gbk', 'ignore'),; |* Q* d5 W/ J8 O
- 'caid':Cater_Name.encode('gbk', 'ignore'),
4 U/ J( r2 C6 T+ I- z! W; _( I9 j - 'title_669977_net':Booktitle.encode('gbk', 'ignore'),
' a2 r* `0 C" c - 'article':BookConte.encode('gbk', 'ignore'),
7 V: P5 b, o2 Q1 x - 'author':Book_author.encode('gbk', 'ignore'),
, h7 Y6 m% i/ h9 U Z+ [: n: B - 'ready_1':Book_Palabras.encode('gbk', 'ignore'),
! G# w% K: @# M, d' q( r2 g - 'thumb':Book_img,% e% e1 A1 P( P X5 v6 A6 A' f
- 'content':Book_Introduction.encode('gbk', 'ignore'),
3 n+ u+ A; t8 @" L0 {3 |3 J - 'abover':abover.encode('gbk', 'ignore') % y8 E, X* c' [, r5 N. c; \' y
- }+ J0 w. U. ?* ? h- m
- res = requests.post(locoy_url, data=locoy_data)
; o6 ^# ~! D) B/ U. X% ^ - print res.text
7 d" ^* p8 F: u" y5 B - print res.content
- O4 t2 L W0 a& c C( M5 [ - # print Dsd
! u4 k) x& l6 v0 _% e# P P$ M$ L - return res) {% B! |/ X5 m/ ^
-
5 Z1 K- G' a9 {9 E9 H: O; D( s - def __init__(self):
; U( o, W) _; o T2 [. \! ~( A9 ` - self.base_url1 = 'https://www.****.cc/'& d. Z! e; L- ?
- self.base_url2 = '/', {5 d, a+ m. o1 r: M& n; K* g: k0 r% p
- self.CaterId = []
5 o& o7 E, ]5 l U: |8 v0 i3 h - self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']/ t4 b5 b$ V* B
- self.page_num = 1
7 Z% K- w: j' W1 q - self.total_num = 200 1 B( v3 {2 \! W) D& m3 W
-
{6 W y. g/ U6 _& P' L: i( w0 I2 n( R& ~ - @every(minutes=8 * 60)1 @$ M7 G1 f! f. H' i3 [! F
- def on_start(self):& [2 ~7 _$ @5 b
- global Cater_Name
& U0 N7 w8 d# G1 V. g% f9 Q2 G - Cater_Name = []
5 d m; A3 a9 G- P2 O3 ` - while self.page_num <= self.total_num:
* q- l$ ^' T% M6 c0 C - for self.CaterId in self.CaterIds:5 a2 Z; n+ _% w1 h
- if self.CaterId == 'xuanhuan':3 w3 \* {; X3 @( v. v
- Cater_Name = '玄幻'
7 D2 T; H: p2 }) `# A - if self.CaterId == 'wuxia':: p9 ?0 a+ O2 c2 [
- Cater_Name = '武侠'
" |* {: |& g$ k, }% { - if self.CaterId == 'lishi':
( C) J- r. E7 ^$ f/ k - Cater_Name = '历史'
% n6 L" t' r) @/ s" D& N% G - if self.CaterId == 'yanqing':
7 H/ F ~0 B2 J0 q7 U6 v) v - Cater_Name = '都市'
9 q. `' S; J9 v - if self.CaterId == 'nvsheng':
* w1 }# A% G7 z' _- L% N - Cater_Name = '都市' 5 q) s5 Y. k/ u8 z
- if self.CaterId == 'kehuan':
; o; w+ C# A8 h8 a - Cater_Name = '科幻' 4 C* w: l1 U: _: u' _6 Z/ c9 l& C9 b
- if self.CaterId == 'kongbu':: u ?4 {8 n* P+ B* }1 _, X
- Cater_Name = '游戏'
8 Z! h2 f3 v' a& P3 p' q4 [ - print self.CaterId/ U7 Z# w- W+ K+ z' `7 h
- url = self.base_url1 + str(self.CaterId) + self.base_url2 + str(self.page_num) + "/" - B1 P7 ]/ g U$ L
- self.crawl(url, callback=self.list_Caterg,save=Cater_Name)
! h8 u2 Z l( m - self.page_num += 1
, h5 A3 b8 d' I& a -
* c' Z, M1 E! x - def list_Caterg(self, response):. h7 w' ]7 Z+ p1 L: r) w! g, U
- Cater_Name = response.save6 e) N. \. l6 _$ Z3 o) Y
- for each in response.doc('.pic-list a[href^="http"]').items():
+ e' N- S6 W/ m( v - self.crawl(each.attr.href, callback=self.list_Caterg_detail,save=Cater_Name)/ V' V- f: d) V$ [5 }
- # q/ J2 _4 d: E( o6 ~/ Z [' r Y+ K
- def list_Caterg_detail(self, response):0 W% b+ N1 d8 Q" c1 W
- Cater_Name = response.save
1 P" ~0 ^$ u8 Q5 Y4 B - # print Cater_Name
' P0 }0 I" M& r1 s4 G' i - Bookname = response.doc('h1').text()$ c N9 |% U. S+ R" |- p: H" H
- print Bookname8 O; h! A& s$ B# o) C
- Book_author = response.doc('.authorname > a').text()# Y+ N# A( S/ ~
- # print Book_author$ _$ h, ]; U# G7 @% A. Z
- Book_Introduction = response.doc('.book-intro > div').text()6 r* g$ B/ f6 E5 O+ j. v
- # print Book_Introduction, b: D. X1 V) y
- Book_Synopsis = response.doc('b').eq(1).text()
9 b3 _2 H- R7 Y; U* L. Z - # print Book_Synopsis
% O. f, `+ I6 p* t( {$ ]: o - Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]" m- p' f: |" E0 L" v
- # print Book_Palabras
: \3 X( \5 g' J* T - BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0] #小说ID! @/ q9 Q0 c! ?$ ^( D
- # print BookIDs9 u4 @# R Q- a
- Book_Dates = str(datetime.datetime.now()) ? H! K" `) p2 B- D
- for imgs in response.doc('.bigpic > img[src^="http"]').items():
+ m0 e# E3 p! v! m9 Y/ s+ P - img = imgs.attr.src; b3 p0 q- F: ^+ Y* q
- print img9 v9 c) z9 R: E4 {5 E/ a; W. y
- #小说封面下载: g. R" y( n4 w3 j! M1 _
- extension = self.getExtension(img)8 ]" C3 K+ X/ m7 r( P0 b5 G/ R
- name = self.getname(img)6 Y0 B1 y, G, \3 q$ \
- file_name = name + "." + extension" x/ ?! i0 `% e# r# O( e! f( p7 Y
- imgDir = P_dir + name% b9 T T8 ~& p- N
- Locaimg = imgDir + "/" + file_name
4 n7 { W! ^1 x' P }$ p2 v - print Locaimg+ Z( s9 x2 g$ ?; N3 h$ T$ I
- if(self.download(P_dir, imgDir, file_name, img)): #这2行可注译,图片下载到本地" w/ D$ i0 @0 j& b
- print('attachment url is ' + img) #
6 E N# i% e, V7 D8 P - Datos = {
( I- ~1 c- W3 S2 c( I - "Cater_Name":Cater_Name,
2 p" F7 J5 b* L: B2 v6 M) _ - "Book_author":Book_author,
! }5 Q& q+ {# s: ? X - "Book_Introduction":Book_Introduction,/ ^9 S6 L0 W0 x1 T, ^. H+ ~" ]
- "Book_Synopsis":Book_Synopsis,0 |& d9 u1 n y+ f
- "Book_Palabras":Book_Palabras,
7 F' ^0 }5 t! {4 p - "img":img,& j) `( ]/ v3 [- \2 c/ u8 e7 L
- }
7 u- q* J8 H9 Y1 W9 R: Y - self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates) #这行可注译,数据库发布接口,方便其他系统的发布
" G, m8 Q; m& Q8 W; b; r - for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():" m% N% Q( a) U0 ~& C. V
- self.crawl(each.attr.href, callback=self.index_page,save=Datos)
% P3 J0 J' @4 e% y7 O -
' e) h7 s, ?* Y1 ]7 ^- P! H) V- f - @config(age=8 * 60 * 60)
; x w. X; a* x - def index_page(self, response):
c! N2 |) w& L2 P- W& q0 O- G - Datos = {
* d9 V/ c& J- O- Q - "Cater_Name":response.save['Cater_Name'],
' ]; o3 Z/ |" g$ h+ Z- V - "Book_author":response.save['Book_author'],9 j* z3 y& d6 s( u
- "Book_Introduction":response.save['Book_Introduction'],
2 C0 q3 [1 d- K& ?/ \5 H - "Book_Synopsis":response.save['Book_Synopsis'],5 J4 H1 x" V" j' \7 c
- "Book_Palabras":response.save['Book_Palabras'],4 P0 ?. K% s, k) y, `$ a
- "img":response.save['img'],4 B9 S: e# |: w7 b) j. T
- }% \$ c% f7 Q1 h1 V$ z* x
- for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():) N7 o$ J6 n" O5 f, x
- # for each in response.doc('.chapter-list a[href^="http"]').items():
/ f) D+ s6 ^' S - self.crawl(each.attr.href, callback=self.detail_page,save=Datos)9 V5 _3 c3 Q U$ ~
- @config(priority=2): g8 q; f0 x/ s* ~" H( [) }9 z
- @catch_status_code_error
3 S# i/ | z3 m8 l+ E% w - def detail_page(self, response): - D/ o! F' z+ i* N+ a, V
- NewRe1 = u'哈书'
3 m5 j l4 Q: U4 E - NewRe2 = u'huhjsd.CC'
% @$ y% A( \$ m' l - NewRe3 = r'^\\n\\n'& f; r7 J( K1 n; C- A( B$ p- N E
- NewRe5 = u'小说网'. O" z+ N. ?! J+ @! o4 Q
- NewRe6 = u'fgdfgf'# m6 f4 ^) c, O1 a5 [. O* `
- NewRe7 = u'fgfgf'! S7 }' f; K! X8 w7 s3 d* s2 v" a: k
- NewRe8 = u'ffhgf'2 h. L9 j3 n9 i! Y3 Y& A* h
- NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
3 {* h4 z) _1 Y. d - ReC1 = u'静思'( }* R1 z w- N4 e' B* J0 `) A* H9 N. Z
- ReC2 = u'aghgf.com'
8 O( p O! K7 B2 X6 h. A. F4 \ - ReC3 = u'aghgfh.com'
- I7 U2 z1 v( g& a/ M6 w - ReC4 = u''8 i# E" J9 Z- k" J9 {; f: p
- ReC5 = u'文学网'
% y' h; N' L0 {8 P/ ] N. a; d- Z# x" P - ReC6 = r'<BR>'' Q3 x3 Z4 F' {3 v$ U: A. y
- Bookname = response.doc('.readlocation a').eq(2).text() #小说名称
# o) c' M. K3 w. U6 v - print Bookname! Q7 @1 P9 u( c5 w
- Cater_Name = response.save['Cater_Name'] # 小说分类9 W$ l! R2 [1 E: {- e
- Book_author = response.save['Book_author'] #小说作者* c5 `9 J, U% S+ X- m4 r
- Book_Introduction1 = response.save['Book_Introduction'] #小说简介
- h9 O# d0 j! G* H5 N( Z6 J$ m' y0 e - Book_Synopsis = response.save['Book_Synopsis'] #最近更新
$ G" f% T/ C `; O - Book_Palabras = response.save['Book_Palabras'] #小说字数( N! r/ Q# D7 P5 C$ w% k6 y) v
- Bookurl = response.url #小说网址
; [. w+ M1 a* Y' M3 L" ]: W - Booktitle = response.doc('.article-title').text() #章节名称% r1 _( Y) Y& j0 J9 F7 N0 ]
- BookID = response.doc('.readset-r span').text() #小说ID$ ?' R7 v$ @" ?9 R5 [8 v9 `' c
- BookConte1 = response.doc('.article-con').text() #小说章节内容7 i& ^8 F; A8 ~4 |1 i1 w0 |
- abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction'] #小说状态(连载还是完成)6 v; a1 P L' L+ { V0 b8 v* i
- Book_Date = str(datetime.datetime.now()) # 采集时间+ R: f) K2 |. J0 I8 b. k
- BookConte2 = BookConte1.replace(NewRe1 , ReC1)2 R) ]2 m; c/ b) t( F5 O' T
- BookConte3 = BookConte2.replace(NewRe2 , ReC2)3 t0 M6 T, U( k
- BookConte5 = BookConte3.replace(NewRe5 , ReC5) R% E4 |: t0 u' b
- BookConte6 = BookConte5.replace(NewRe6 , ReC2); \6 c. B& q) k) g3 O) \0 e
- BookConte7 = BookConte6.replace(NewRe7 , ReC2)
* `, p/ \1 g/ h4 d2 \9 @" J1 z; i - BookConte8 = BookConte7.replace(NewRe3 , ReC6)! E9 w' ^ h ~. w. v
- BookConte4 = re.sub(NewRe4 , ReC4 , BookConte8)
" t3 v. d# Q1 v( [ - BookConte = BookConte4.replace("\n\n","<br>")
" H$ o# ~, l3 s+ F( H4 Y" i - print BookConte
& Y& T/ j( B( C' h( M0 X2 k! `6 K - Book_Introduction2 = Book_Introduction1.replace(NewRe1 , ReC1)/ Q e/ b: `" v1 B' O8 I; a
- Book_Introduction3 = Book_Introduction2.replace(NewRe2 , ReC2)5 V+ d4 [6 c4 r) v8 i( ]
- Book_Introduction4 = Book_Introduction3.replace(NewRe3 , ReC3)
+ m. J& V. t( h- q: Y. r3 g+ z - Book_Introduction = re.sub(NewRe4 , ReC4 , Book_Introduction4)
7 \0 a3 v8 @7 h) @) H+ Z- E( G - Titleid = response.url.split(BookID + "/")[-1].split("/")[0] 8 N# D2 M! w9 C3 D% ]/ T) O
- Book_img = response.save['img'], #小说图片
/ a5 Z5 h q/ U( w$ h. L, o# x. d - ( p Z, d# @1 e( H4 l
- #insert into MySQL 小说入库/ d2 M, [" t* o1 t* u( n
- self.add_question(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布
. B+ S; B2 k/ r" U* ~ - self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date) #这行可注译,数据库发布接口,方便其他系统的发布8 ~, W, y7 Y+ u4 \
- #post提交发布/ p- z6 H3 [' j( n s' Q2 ~4 |# T: @
- self.add_locoy(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover) #这行可注译,火车头发布接口,不需要可取消
- p8 @* z5 w- @4 N8 g3 v! X# e - Datos = {
, q- V) ^' j1 _1 Q+ J - "Cater_Name":response.save['Cater_Name'],
6 k0 T) ^9 F1 r - "Book_author":response.save['Book_author'],
: y& D" Z5 G; i& m9 X - "Book_Introduction":response.save['Book_Introduction'],
" }$ m- g+ G" P+ {* E - "Book_Synopsis":response.save['Book_Synopsis'],
$ A! q) X/ ^3 ^ d- h$ N, o1 N2 @1 ~ - "Book_Palabras":response.save['Book_Palabras'],
. F( | H/ d1 F* _5 f - "img":response.save['img'],
/ c5 Y) N/ }7 J2 D ^0 ^3 b# x - }
1 I1 V/ h4 r& h% s$ k! x$ f0 S - for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():/ { _- H p$ {7 U4 F1 G
- self.crawl(each.attr.href, callback=self.detail_page,save=Datos) 2 D' `" f6 E! [ p R% ~. N# v
- return {
# x0 a( S$ X1 t/ g% B - "Cater_Name":Cater_Name,
' U& b+ t2 f1 l. b" z - "Bookname":Bookname,
- R0 l/ G ?. p: U" i4 ]* x - "Book_author":Book_author,
) L( r) I+ W2 t, g2 Q - "Book_Introduction":Book_Introduction,! a1 v4 ]5 V7 x) k. |" T* W" U
- "Book_Synopsis":Book_Synopsis,
2 W( U/ K+ j4 n( m9 H, \ - "Book_Palabras":Book_Palabras,
" S, I6 T9 {, Y) A, v - "Book_img":Book_img,
) r* v4 g) W. P a - "Bookurl": response.url,
" T7 L: u `8 ?: R - "Booktitle": Booktitle,
. U9 _$ T8 j/ ?8 M) M - "BookID": BookID,
. I. V1 @% ?& c8 A1 ^0 ~ - "BookConte": BookConte,1 V+ x+ K7 w& U) P
- "Titleid": Titleid,9 A# k5 S o7 b
- "abover":abover,4 O3 s/ N* e# p3 |2 @
- # "Book_Date" = str(datetime.datetime.now()),
4 x) E( V: W1 r - }
+ g: w2 b2 d3 p: k - def download(self, P_dir, imgDir, file_name, Book_img):
0 ]& e7 A8 j( b: c5 g+ o6 L* A - if not os.path.exists(imgDir):
8 d: k( M3 X1 ~3 I, M4 |/ M - os.makedirs(imgDir)7 O+ D0 U4 [; i7 B" V
- file = imgDir + "/" + file_name( x9 x, {$ W8 i1 `) @8 j" N5 e: q
- # print file
# `2 o- V, Q/ V a% g( U - f = open(file, 'wb+')
( t- j1 i `; h; e2 ^$ J: d# g - imag = requests.get(Book_img)
8 S1 k, T; `: l m- v! D0 g - f.write(imag.content)0 U. p1 o$ o3 v) \
- f.close()
f& O. [; A; {( r+ K - #保存图片前4 F; F4 h# V$ H# g- d0 C
- def save_imgs(self,response):
I6 G/ X, V7 U h - content = response.content
8 ?5 S, ^5 ]" X' `9 h' y# ?# e- ` - file_name = response.save["file_name"]$ m' T% H8 l0 K! j0 T, n5 K
- imgDir = response.save["imgDir"]
$ F! u) E! ?" o1 g: D% _8 W - file_path = imgDir + file_name6 K. e5 c' F3 K. d
- self.save_img(content,imgDir,file_path), t9 ?/ j; [6 P |- O) M5 Q
- #保存图片
7 u" Z. j* @8 C* y. O - def save_img(self,content,imgDir,path):
; h( O9 }$ f/ @. n; ^ - if not os.path.exists(imgDir):
n0 a6 p; n- A T* b2 q - os.makedirs(imgDir)
$ `+ O5 \* T9 u* ` - f = open(path,"wb" )! L% \) @- `' S/ g+ K
- f.write(content)
- |3 V) K$ Q" A* c - f.close()
3 ^; R n* r) d* ]% E$ Q: D ]+ W - #获取url后缀名5 G6 y# B: {2 e% S
- def getExtension(self,url):
7 n* R! n9 H( S% o; y - extension = url.split(".")[-1]0 _3 |9 }3 j3 c% [& d {7 y
- return extension
+ C# p9 E: B, f - 8 b: Q* s8 Z% N) K
- #获取图片名/ h9 y* o# B5 c4 E
- def getname(self,url):
6 M7 Y" H4 Y- @+ m - name=url.split("/")[-1].split(".")[0]
' x9 U& e8 c& d; O1 s! r8 ` - return name
# End of listing (the "copy code" button label from the source page was removed).