Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow: ``on_start`` enqueues category list pages -> ``list_Caterg`` follows
    each book link -> ``list_Caterg_detail`` scrapes book metadata, saves the
    cover image and follows the catalogue link -> ``index_page`` opens the
    first chapter -> ``detail_page`` stores/publishes the chapter and follows
    the "next chapter" link.

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to a Locoy publishing endpoint (``add_locoy``).
    """

    # These names are declared ``global`` inside the class body, so the
    # assignments below bind module-level variables, not class attributes.
    # The methods read ``P_dir`` through that module-level binding.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory cover images are saved under
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Site category slug -> category name sent to the database / publisher.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the ``save`` dict handed from one callback to the next.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Pattern of punctuation runs stripped from chapter text and synopsis.
    # NOTE(review): this is a byte-string pattern containing non-ASCII
    # characters; confirm it matches as intended against unicode text.
    _PUNCTUATION_RE = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # NOTE(review): this looks like a regular expression but is only ever
    # passed to ``str.replace``, so it is matched literally - confirm intent.
    _LITERAL_NEWLINES = r'^\\n\\n'

    # (old, new) literal substitutions applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (_LITERAL_NEWLINES, r'<BR>'),
    )

    # (old, new) literal substitutions applied, in order, to the synopsis.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (_LITERAL_NEWLINES, u'aghgfh.com'),
    )

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def _run_insert(self, sql, params, verbose=False):
        """Execute one parameterized INSERT on a fresh connection.

        The values are passed separately from the SQL text so the driver
        escapes them; scraped text containing quotes can no longer break or
        alter the statement. Errors raised while executing are printed and
        rolled back (not re-raised); the connection is always closed.

        :param sql: statement using ``%s`` placeholders.
        :param params: tuple of values, one per placeholder.
        :param verbose: when true, print the statement and the new row id.
        """
        db = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        try:
            cursor = db.cursor()
            if verbose:
                print(sql)
            cursor.execute(sql, params)
            if verbose:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, date)."""
        sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values (%s,%s,%s,%s,%s)'
        self._run_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values (%s,%s,%s,%s,%s)'
        self._run_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the new row id."""
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,'
               'Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self._run_insert(sql, params, verbose=True)

    # ------------------------------------------------------------------
    # HTTP publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are encoded to GBK, dropping characters GBK cannot
        represent. The response text and raw content are printed.

        :returns: the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: switch the interpreter-wide default codec to
            # GBK. ``reload``/``setdefaultencoding`` do not exist on
            # Python 3, hence the version guard.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1    # first list page to crawl
        self.total_num = 200  # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Enqueue every category list page from ``page_num`` to ``total_num``.

        The page counter is a local loop variable, so ``self.page_num`` is
        not advanced and each scheduled run walks the whole range again.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                self.CaterId = cater_id  # still exposed as an instance attribute
                # Slugs missing from the table fall back to an empty name.
                cater_name = self._CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, save its cover(s) and follow the catalogue link."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The word count is the second space-separated token, up to the
        # first '|'; fall back to '' when the text has no second token.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the code below working when the page has no cover.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_over(self, save):
        """Return a new dict holding the book metadata keys copied from ``save``."""
        return dict((key, save[key]) for key in self._SAVE_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter of a book from its catalogue page.

        Only the first list item is followed; ``detail_page`` then follows
        the link it selects on each chapter page.
        """
        Datos = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _scrub(self, text, replacements):
        """Apply the literal ``replacements`` in order, then strip punctuation runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._PUNCTUATION_RE, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and enqueue the following chapter.

        :returns: dict describing the chapter (pyspider result).
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = save['Cater_Name']              # category
        Book_author = save['Book_author']            # author
        Book_Synopsis = save['Book_Synopsis']        # latest update
        Book_Palabras = save['Book_Palabras']        # word count
        Bookurl = response.url                       # chapter URL
        Booktitle = response.doc('.article-title').text()   # chapter title
        BookID = response.doc('.readset-r span').text()     # book id
        abover = Booktitle + save['Book_Synopsis'] + save['Book_Palabras'] + save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())     # crawl time

        raw_content = response.doc('.article-con').text()   # chapter body
        BookConte = self._scrub(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(save['Book_Introduction'], self._INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain value: a trailing comma here would wrap the URL in a 1-tuple.
        Book_img = save['img']  # cover image URL

        # Store in MySQL.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Publish through the HTTP endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the fourth link of the chapter's button bar when it points
        # into "/xiaoshuo", passing the same book metadata along.
        Datos = self._carry_over(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        The request is made before the file is opened so a failed request
        does not leave an empty file behind.

        :param P_dir: unused; kept for call compatibility.
        :returns: ``True`` when the image was written, ``False`` when the
            server answered with a non-200 status.
        """
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        imag = requests.get(Book_img)
        if imag.status_code != 200:
            return False
        with open(imgDir + "/" + file_name, 'wb+') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Crawl callback: store a fetched image using the paths in ``response.save``."""
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator only when imgDir lacks one.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码