Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import json
import os
import random
import re
import sys
import urllib2
import HTMLParser
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site category by category.

    Flow: ``on_start`` queues the category list pages, ``list_Caterg``
    follows each book link, ``list_Caterg_detail`` reads the book metadata
    and downloads the cover, ``index_page`` opens the first chapter and
    ``detail_page`` stores/publishes each chapter and follows the link to
    the next one.
    """

    # P_dir and Datos are kept as module-level globals, exactly as in the
    # original script, so anything reading them from the module still works.
    global P_dir, Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection settings for the publishing database. The values are the
    # placeholders from the original script and must be filled in.
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name passed along with every task.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from task to task via `save`.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    def _insert_row(self, table, columns, values):
        """Insert one row into `table` and return the new row id.

        `table` and `columns` are fixed identifiers supplied by the methods
        of this class; `values` are passed as query parameters so scraped
        text containing quotes cannot break or alter the statement.
        On a database error the error is printed, the transaction is rolled
        back and None is returned (best effort, as in the original).
        """
        sql = 'insert into %s(%s) values (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        # closing() guarantees the connection is released on every path.
        with closing(pymysql.connect(**self._DB_SETTINGS)) as db:
            try:
                cursor = db.cursor()
                cursor.execute(sql, values)
                row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def _book_meta(self, save):
        """Return a fresh dict with the book metadata keys copied from `save`."""
        return dict((key, save[key]) for key in self._META_KEYS)

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Record a book's cover (remote URL and local path) in table BookFile."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Record a chapter title entry in table BookTitle."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one chapter with its book metadata in table BookConte.

        Prints the id of the inserted row when the insert succeeds.
        """
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        All text fields are sent GBK-encoded (unencodable characters are
        dropped). Returns the `requests` response object.
        """
        try:
            # Python 2 only: switch the implicit str/unicode codec to GBK.
            reload(sys)
            sys.setdefaultencoding("gbk")
        except NameError:
            # `reload` is not a builtin on Python 3; nothing to do there.
            pass
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish API URL
        locoy_data = {
            'my_u': '用户名',  # backend user name
            'my_p': '密码',  # backend password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first list page to crawl
        self.total_num = 200  # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every category list page from page_num to total_num.

        The page counter is a local loop variable, so repeated scheduled
        runs on the same handler instance always start from `page_num`
        again instead of finding the counter already exhausted.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read book metadata, download the cover and queue the catalog page.

        When the page has no matching cover image, `img` and the local
        path stay empty strings instead of being undefined.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token of the title paragraph, up to '|'.
        # Raises IndexError when that paragraph contains no space.
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # novel id
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into <P_dir>/<name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: these two lines may be removed to skip the download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        book_meta = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=book_meta)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter link of the catalog page."""
        book_meta = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Clean, store and publish one chapter, then queue the next one.

        Returns a dict with the chapter and book fields as the task result.
        """
        # Literal (old, new) text substitutions, applied in order.
        # NOTE(review): r'^\\n\\n' is used with str.replace, so it is
        # matched as literal characters, not as a regular expression.
        content_replacements = (
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (u'小说网', u'文学网'),
            (u'fgdfgf', u'aghgf.com'),
            (u'fgfgf', u'aghgf.com'),
            (r'^\\n\\n', r'<BR>'),
        )
        intro_replacements = (
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (r'^\\n\\n', u'aghgfh.com'),
        )
        # Punctuation runs removed from the text after the substitutions.
        # NOTE(review): kept byte-for-byte; under Python 2 this is a
        # byte-string pattern, so how its non-ASCII part matches unicode
        # text should be confirmed before relying on it.
        strip_pattern = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # novel name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # novel category
        Book_author = saved['Book_author']  # novel author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # novel id
        # Concatenation of chapter title, latest update, word count and intro.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = response.doc('.article-con').text()  # chapter content
        for old, new in content_replacements:
            BookConte = BookConte.replace(old, new)
        BookConte = re.sub(strip_pattern, u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = saved['Book_Introduction']  # novel introduction
        for old, new in intro_replacements:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = re.sub(strip_pattern, u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: the original's trailing comma made this a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        book_meta = self._book_meta(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch `Book_img` and save it as `imgDir`/`file_name`.

        Returns True when the file was written and False when the server
        answered with an error status (nothing is written then). The image
        is fetched before the file is opened, so a failed request no longer
        leaves an empty file behind. `P_dir` is accepted for compatibility
        and not used.
        """
        imag = requests.get(Book_img)
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback: save the response body using the names stored in `save`."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # No separator is inserted between the directory and the file name.
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write `content` to `path`, creating `imgDir` first if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in `url`."""
        extension = url.split(".")[-1]
        return extension

    def getname(self, url):
        """Return the last path segment of `url` up to its first '.'."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码
' p! I& Z: `: u, w1 j# m6 S# ] + r; l9 [4 e1 \/ K
|