Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that scrapes a novel site category by category.

    Flow: ``on_start`` queues the category listing pages, ``list_Caterg``
    follows every book on a listing, ``list_Caterg_detail`` reads the book
    metadata and downloads the cover, ``index_page`` opens the first chapter
    and ``detail_page`` walks the chapters through the "next" link.  Every
    chapter is written to MySQL and posted to a Locoy publishing endpoint.

    NOTE(review): the indentation of the original listing was lost in the
    paste; the nesting below is reconstructed from the control flow and
    should be confirmed against the author's copy if one is available.
    """

    # ``global`` inside a class body binds these as module-level names,
    # exactly as the original script did; kept for backward compatibility.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # MySQL connection settings (placeholders: fill in the real credentials).
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Category slug used in the listing URL -> category name sent downstream.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the ``save`` payload handed from one callback to the next.
    _BOOK_FIELDS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the book intro.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of punctuation that are stripped from chapter text and intro.
    _PUNCTUATION_RE = re.compile(
        r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    def _insert_row(self, table, columns, values):
        """Insert one row and return its id, or None when the insert failed.

        The statement is parameterized so that quotes inside scraped text
        can neither break nor inject into the SQL.  ``table`` and ``columns``
        come from constants in this class, never from scraped data.
        Failures are printed and rolled back instead of raised (best-effort,
        as in the original), and the connection is always closed.
        """
        sql = 'insert into %s(%s) values (%s)' % (
            table, ', '.join(columns), ', '.join(['%s'] * len(columns)))
        db = pymysql.connect(**self._DB_SETTINGS)
        try:
            cursor = db.cursor()
            cursor.execute(sql, values)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store a book's cover record (remote URL and local path) in BookFile."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store a chapter index entry in BookTitle."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store a full chapter row in BookConte and print the new row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint, GBK-encoded.

        Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: the original forced the default codec to GBK.
            reload(sys)  # noqa: F821 - builtin on Python 2
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` of every category.

        Local loop variables are used instead of mutating ``self.page_num``:
        the original left the counter past ``total_num`` after the first run,
        so every later scheduled run queued nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Unknown slugs fall back to the slug itself.
                cater_name = self._CATEGORY_NAMES.get(cater_id, cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _parse_word_count(text):
        """Return the second space-separated field of *text*, cut at '|'.

        Returns an empty string when *text* has no second field, where the
        original raised IndexError.
        """
        fields = text.split(' ')
        if len(fields) < 2:
            return ''
        return fields[1].split('|')[0]

    def list_Caterg_detail(self, response):
        """Read book metadata, download the cover and queue the chapter index.

        Everything after the metadata extraction happens once per cover
        image with an absolute URL, because the later steps need that URL.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Work out where the cover is stored locally.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional step: remove these two lines to skip the local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            book_data = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            # Optional step: database record of the cover for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
            for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
                self.crawl(each.attr.href, callback=self.index_page, save=book_data)

    def _book_payload(self, saved):
        """Copy the book metadata fields out of a ``response.save`` mapping."""
        return dict((key, saved[key]) for key in self._BOOK_FIELDS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on the book's chapter index."""
        book_data = self._book_payload(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_data)

    @staticmethod
    def _replace_all(text, pairs):
        """Apply each literal (old, new) replacement in *pairs*, in order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, and queue the next chapter.

        Returns a dict with the chapter and book fields as the task result.
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']        # category
        Book_author = saved['Book_author']      # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url                  # chapter URL
        Booktitle = response.doc('.article-title').text()   # chapter title
        BookID = response.doc('.readset-r span').text()     # book id
        raw_content = response.doc('.article-con').text()   # chapter text
        # The original comment calls this the book status (serialising or
        # finished); the value is this concatenation of scraped fields.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # scrape time

        cleaned = self._replace_all(raw_content, self._CONTENT_REPLACEMENTS)
        cleaned = self._PUNCTUATION_RE.sub(u'', cleaned)
        BookConte = cleaned.replace("\n\n", "<br>")
        print(BookConte)

        intro = self._replace_all(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Book_Introduction = self._PUNCTUATION_RE.sub(u'', intro)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: the original's trailing comma made this a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # Optional step: database records for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional step: publish through the Locoy endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the "next chapter" button with the same book metadata.
        book_data = self._book_payload(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_data)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch *Book_img* and save it as ``imgDir/file_name``.

        Returns True when the file was written and False when the request or
        the write failed; the original always returned None, so the caller's
        ``if self.download(...)`` branch could never run.  ``P_dir`` is
        accepted for interface compatibility and is not used.
        """
        target = imgDir + "/" + file_name
        try:
            # Fetch first so that a failed request leaves no empty file.
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            reply.raise_for_status()
            if not os.path.exists(imgDir):
                os.makedirs(imgDir)
            with open(target, 'wb') as out:
                out.write(reply.content)
        except (requests.RequestException, IOError, OSError) as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        return True

    def save_imgs(self, response):
        """Callback: write the response body to the path given in ``save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join supplies the separator the original concatenation
        # omitted, and adds none when imgDir already ends with one.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* to *path*, creating *imgDir* first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in *url*."""
        return url.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first '.'."""
        return url.rsplit("/", 1)[-1].split(".", 1)[0]
复制代码