Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes chapters.

    Flow: ``on_start`` schedules the category listing pages, ``list_Caterg``
    follows each book link, ``list_Caterg_detail`` scrapes book metadata and
    downloads the cover, ``index_page`` opens the first chapter, and
    ``detail_page`` stores/publishes one chapter and follows the "next
    chapter" link.
    """

    # P_dir and Datos are published as module-level globals (as the original
    # script did); list_Caterg_detail reads P_dir by its bare name.
    global Datos, P_dir
    P_dir = '/Tools/Debug/'  # local directory where downloaded covers are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Site category slug -> category name sent to the publishing side.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict passed between callbacks via `save`.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # NOTE(review): r'^\\n\\n' is used with str.replace, so it only matches
    # that literal character sequence, not a leading blank line — confirm
    # the intent before turning it into a regex.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the book intro.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of punctuation that are stripped after the literal substitutions.
    _PUNCTUATION_RE = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _insert_row(self, sql, params):
        """Execute one parameterized INSERT and return the new row id.

        The values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can no longer break or
        inject into the query. The connection is always closed.

        Returns None (after printing the error and rolling back) when the
        statement fails, mirroring the original best-effort behaviour.
        Connection errors still propagate to the caller.
        """
        connection = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        with closing(connection) as db:
            try:
                with closing(db.cursor()) as cursor:
                    cursor.execute(sql, params)
                    row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Record a book's cover (remote URL and local path) in `BookFile`."""
        sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values (%s,%s,%s,%s,%s)'
        self._insert_row(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Record one chapter title in `BookTitle`."""
        sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values (%s,%s,%s,%s,%s)'
        self._insert_row(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one full chapter row in `BookConte` and print its row id."""
        sql = (
            ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
            'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) '
            'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        row_id = self._insert_row(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
            Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy (火车头) publishing endpoint.

        Text fields are sent GBK-encoded; characters GBK cannot represent
        are dropped. Returns the `requests` response object.
        """
        if sys.version_info[0] == 2:
            # Python 2 only: the original script forced the default codec to
            # GBK before posting. `reload` does not exist on Python 3.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # backend user name
            'my_p': '密码',  # backend password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # Without a timeout a stalled endpoint would block the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every listing page of every configured category.

        Uses local loop variables instead of advancing `self.page_num`, so
        each periodic run covers the full page range again rather than
        finding the counter already past `total_num`.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATEGORY_NAMES.get(cater_id)
                if cater_name is None:
                    # No publishing category is known for this slug.
                    continue
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, download its cover and open its chapter list."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The word count is the second space-separated token, up to the
        # first '|'; fall back to '' if the markup differs.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the metadata dict well-formed when no cover is found.
        img = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: save the cover locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter listed on a book's catalogue page."""
        Datos = {key: response.save[key] for key in self._SAVE_KEYS}
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _clean_text(self, text, replacements):
        """Apply literal (old, new) replacements, then strip punctuation runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._PUNCTUATION_RE, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, store and publish it, then follow "next".

        Returns the chapter as a dict (the pyspider task result).
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        chapter_text = response.doc('.article-con').text()
        BookConte = self._clean_text(chapter_text, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: the original trailing comma turned this into a tuple.
        Book_img = saved['img']

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = {key: saved[key] for key in self._SAVE_KEYS}
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download `Book_img` to `imgDir/file_name`.

        Returns True when the file was written, False when the request
        failed or returned an error status. `P_dir` is accepted for
        backward compatibility and is not used.
        """
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if not reply.ok:
            return False
        # Fetch first, write second: a failed request leaves no empty file.
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback: save a fetched image using paths carried in `save`."""
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator the original concatenation lacked.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write `content` (bytes) to `path`, creating `imgDir` if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the file extension of `url` (no dot), or '' if none.

        Only the last path segment is inspected and any query string or
        fragment is ignored, so 'http://h.com/a.jpg?x=1' yields 'jpg'.
        """
        last_segment = url.split("?")[0].split("#")[0].split("/")[-1]
        if "." not in last_segment:
            return ""
        return last_segment.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of `url` up to its first dot."""
        last_segment = url.split("?")[0].split("#")[0].split("/")[-1]
        return last_segment.split(".")[0]
复制代码
) ] j2 R3 h: r: ^$ Z 4 G1 w7 O9 s. j
|