Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
-
' H/ n' ]6 W( ^6 F# _ d9 a6 O' a - from pyspider.libs.base_handler import *
3 c/ X: E% \) v9 a - import pymysql
% y! T- H8 n5 @- D; _8 _3 @ - import random
8 N- K* q9 q6 A - import datetime
2 r; }" f+ I4 z; Q- k - import urllib2,HTMLParser,re( M/ x, B! p; }% M
- import os, P/ ~. n0 p5 v: S
- import sys
& @8 s6 g6 ~# W - import re
! k+ Y8 M( J7 X- m - import codecs
! }8 T) `5 X; Q: B$ e - import requests8 Z; p( w* p* D5 }9 W. W
- import json0 S7 ^" y% x* f7 ]. L
- % [& j( r9 @! G8 b5 ^
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow of callbacks: ``on_start`` -> ``list_Caterg`` (category listing)
    -> ``list_Caterg_detail`` (book page, cover download) -> ``index_page``
    (chapter catalogue) -> ``detail_page`` (one chapter, then follows the
    "next chapter" link).  Each chapter is written to MySQL and posted to
    an HTTP publishing endpoint.

    The code is written so that it parses on both Python 2 and Python 3;
    the surrounding file targets Python 2.
    """

    # Kept as module-level globals because the original script exposed them
    # that way; P_dir is read by list_Caterg_detail.
    global Datos, P_dir
    # Root directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection settings shared by every insert (placeholders to fill in).
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name used when publishing.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # Literal (non-regex) replacements applied, in order, to chapter text.
    # NOTE(review): r'^\\n\\n' is used with str.replace, so it only matches
    # that literal character sequence -- it looks like it was meant to be a
    # regex for a leading blank line; confirm before changing.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal replacements applied, in order, to the book introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Punctuation stripped from chapter text and introductions.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    # On Python 2 the literal above is a UTF-8 byte string; matching it
    # against unicode page text would treat each byte as a separate
    # character, so decode it to a unicode pattern first.
    if isinstance(_STRIP_PATTERN, bytes):
        _STRIP_PATTERN = _STRIP_PATTERN.decode('utf-8')
    _STRIP_RE = re.compile(_STRIP_PATTERN)

    def _run_insert(self, sql, params):
        """Execute one parameterized INSERT and commit it.

        Values are passed separately from the SQL text so the driver
        escapes them; scraped text containing quotes can no longer break
        or inject into the statement.

        Returns the cursor's ``lastrowid`` on success, ``None`` on failure.
        Failures are printed and rolled back rather than raised.
        """
        db = pymysql.connect(**self._DB_SETTINGS)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                row_id = cursor.lastrowid
            finally:
                cursor.close()
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            # Always release the connection; the original never closed it.
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into the ``BookFile`` table (book cover record)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into the ``BookTitle`` table (chapter index record)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one row into the ``BookConte`` table (chapter content).

        Prints the new row id when the insert succeeds.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._run_insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction,
            Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
            BookConte, Titleid, abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the HTTP publishing endpoint.

        Text fields are sent GBK-encoded (unencodable characters dropped).
        Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # NOTE(review): process-wide default-encoding hack kept from the
            # original for Python 2; every field below is already encoded
            # explicitly, so this is probably removable -- confirm.
            reload(sys)  # noqa: F821 (builtin on Python 2 only)
            sys.setdefaultencoding("gbk")
        # Publishing endpoint URL (site name masked in the original).
        locoy_url = 'http://www.******.net/locoy/?my=book'
        locoy_data = {
            'my_u': '用户名',  # back-end user name (placeholder)
            'my_p': '密码',  # back-end password (placeholder)
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1    # first listing page to crawl
        self.total_num = 200  # last listing page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category listing page from page_num to total_num.

        The page counter is a local loop variable: the original advanced
        ``self.page_num`` past ``total_num``, so any later scheduled run on
        the same handler instance queued nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, download its cover, and queue its catalogue."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token, up to the first '|'.  Guarded so a
        # page whose title block has no space does not raise IndexError.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        # Book id: path segment following "xiaoshuo/" in the page URL.
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]
        Book_Dates = str(datetime.datetime.now())

        # Stays empty when the page has no cover image, so the catalogue is
        # still crawled (the original raised UnboundLocalError in that case).
        img = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Cover download: stored as <P_dir><name>/<name>.<ext>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: saves the cover image locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Optional: records the cover in the database for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        book_meta = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=book_meta)

    def _copy_book_meta(self, saved):
        """Return a fresh dict holding the book metadata keys from *saved*."""
        return dict((key, saved[key]) for key in self._SAVE_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's catalogue page."""
        book_meta = self._copy_book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, and queue the next chapter.

        Returns a dict with the cleaned chapter fields as the task result.
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = saved['Cater_Name']        # category
        Book_author = saved['Book_author']      # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url                  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()    # book id
        # Concatenation of chapter title, synopsis, word count and intro;
        # the original comment labels it "status (serialising or finished)".
        abover = (response.doc('.article-title').text() + saved['Book_Synopsis']
                  + saved['Book_Palabras'] + saved['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter body: literal replacements, punctuation strip, then
        # paragraph breaks converted to <br>.
        BookConte = response.doc('.article-con').text()
        for old, new in self._CONTENT_REPLACEMENTS:
            BookConte = BookConte.replace(old, new)
        BookConte = self._STRIP_RE.sub(u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = saved['Book_Introduction']
        for old, new in self._INTRO_REPLACEMENTS:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = self._STRIP_RE.sub(u'', Book_Introduction)

        # Chapter id: path segment following "<BookID>/" in the URL.
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Cover URL.  The original had a trailing comma here, which turned
        # the value into a 1-tuple before it was posted and returned.
        Book_img = saved['img']

        # Optional: store the chapter in MySQL for other publishing systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
                          BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: publish the chapter through the HTTP endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the "next chapter" link, carrying the raw metadata along.
        book_meta = self._copy_book_meta(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download *Book_img* and write it to ``imgDir/file_name``.

        ``P_dir`` is accepted for interface compatibility and not used.
        Returns True once the file has been written; the original returned
        None, so the caller's success branch never ran.
        """
        # Fetch first so a failed request does not leave an empty file.
        imag = requests.get(Book_img)
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        target = imgDir + "/" + file_name
        with open(target, 'wb+') as out:
            out.write(imag.content)
        return True

    def save_imgs(self, response):
        """Write a fetched image response to the path given in response.save."""
        imgDir = response.save["imgDir"]
        # os.path.join supplies the separator the original concatenation
        # omitted when imgDir has no trailing slash.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* to *path*, creating *imgDir* if it is missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in *url*."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码 , }; _; l$ S' g& c6 N2 G; P
$ @5 j$ u4 ~8 S# Z: Y6 ~% ~5 x) h2 O
|