Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *


# Local root directory under which downloaded cover images are stored.
# Kept as a module-level name because the original class body published it
# with ``global``.
P_dir = '/Tools/Debug/'
# Legacy module-level placeholder, likewise published by the original class body.
Datos = {}


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site.

    Flow: category listing pages -> book detail page -> chapter index ->
    chapter pages (following the "next chapter" link).  Each chapter is
    written to MySQL and posted to a Locoy publishing endpoint; book covers
    are downloaded to local disk.
    """

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection settings for pymysql.connect(); the values are placeholders
    # that must be replaced with real credentials.
    db_config = {
        "host": "localhost",
        "user": "数据库用户名",
        "password": "密码",
        "db": "数据库名",
        "charset": "utf8",
    }

    # Seconds to wait on direct ``requests`` calls so that a stalled server
    # cannot block the worker forever.
    REQUEST_TIMEOUT = 30

    # Site category slug -> category name used by the publishing target.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the ``save`` payload that is handed from callback to callback.
    SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Ordered literal substitutions applied to the chapter text.
    # NOTE(review): the last pair is used with str.replace(), so it only
    # matches the literal characters ``^\\n\\n`` and never a real leading
    # blank line -- confirm what was intended.
    CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Ordered literal substitutions applied to the book introduction.
    INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of ASCII / CJK punctuation that are removed from scraped text.
    # NOTE(review): on Python 2 this is a byte-string pattern, so the
    # non-ASCII members are seen as individual bytes -- confirm behaviour.
    PUNCTUATION_RE = re.compile(r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl (inclusive)

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def _execute_insert(self, sql, params):
        """Run one parameterized INSERT and return ``cursor.lastrowid``.

        The statement is committed on success.  On a MySQL error the
        transaction is rolled back, the error is printed and ``None`` is
        returned (best effort, as in the original).  The connection is
        always closed.
        """
        db = pymysql.connect(**self.db_config)
        with closing(db):
            try:
                with closing(db.cursor()) as cursor:
                    # Values are bound by the driver, never interpolated
                    # into the SQL text, so quotes in scraped text are safe.
                    cursor.execute(sql, params)
                    row_id = cursor.lastrowid
                db.commit()
            except pymysql.MySQLError as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None
        return row_id

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book/cover record into the ``BookFile`` table."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into the ``BookConte`` table.

        Prints the new row id when the insert succeeds.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._execute_insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # Locoy publishing
    # ------------------------------------------------------------------
    @staticmethod
    def _to_gbk(value):
        """Return ``value`` as GBK bytes, dropping unmappable characters.

        Byte strings are assumed to be UTF-8 and are transcoded.  This
        replaces the process-wide ``sys.setdefaultencoding("gbk")`` hack.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'ignore')
        return value.encode('gbk', 'ignore')

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded.  Returns the ``requests``
        response object.
        """
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name (placeholder)
            'my_p': '密码',  # back-office password (placeholder)
            'subject_669977_net': self._to_gbk(Bookname),
            'caid': self._to_gbk(Cater_Name),
            'title_669977_net': self._to_gbk(Booktitle),
            'article': self._to_gbk(BookConte),
            'author': self._to_gbk(Book_author),
            'ready_1': self._to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': self._to_gbk(Book_Introduction),
            'abover': self._to_gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data, timeout=self.REQUEST_TIMEOUT)
        print(res.text)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category listing page from page_num to total_num."""
        # Iterate with locals: mutating self.page_num (as the original did)
        # left the counter past total_num, so later scheduled runs on the
        # same instance crawled nothing.
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _parse_word_count(text):
        """Return the token after the first space, cut at '|', or '' if absent."""
        try:
            return text.split(' ')[1].split('|')[0]
        except IndexError:
            return ''

    def list_Caterg_detail(self, response):
        """Scrape a book's detail page, save its cover and queue its chapter index."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a cover image no longer raises NameError.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into <P_dir>/<name>/<name>.<ext>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_over(self, save):
        """Copy the book-level fields that travel with every follow-up request."""
        return dict((key, save[key]) for key in self.SAVE_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of a book from its chapter index page."""
        Datos = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @classmethod
    def _scrub(cls, text, replacements):
        """Apply the literal replacements in order, then strip punctuation runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return cls.PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and queue the next chapter.

        Returns a dict with the scraped chapter fields.
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']              # category
        Book_author = save['Book_author']            # author
        Book_Synopsis = save['Book_Synopsis']        # latest update
        Book_Palabras = save['Book_Palabras']        # word count
        Bookurl = response.url                       # chapter URL
        Booktitle = response.doc('.article-title').text()   # chapter title
        BookID = response.doc('.readset-r span').text()     # book id
        # "Status" field as assembled by the original: title + latest update
        # + word count + raw introduction.
        abover = (response.doc('.article-title').text() + save['Book_Synopsis']
                  + save['Book_Palabras'] + save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())     # crawl time

        BookConte = self._scrub(response.doc('.article-con').text(),
                                self.CONTENT_REPLACEMENTS)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(save['Book_Introduction'],
                                        self.INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which made this a 1-tuple.
        Book_img = save['img']  # cover image URL

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: publish through the Locoy HTTP endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the "next chapter" button, carrying the book-level fields.
        Datos = self._carry_over(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        ``P_dir`` is accepted for interface compatibility and is not used.
        Returns True when the file was written, False when the HTTP request
        failed.  The original always returned None, so the caller's success
        branch never ran.
        """
        try:
            # Fetch before touching the disk so a failed request does not
            # leave an empty file behind.
            imag = requests.get(Book_img, timeout=self.REQUEST_TIMEOUT)
            imag.raise_for_status()
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback variant: store a fetched image using paths from ``response.save``."""
        imgDir = response.save["imgDir"]
        # os.path.join supplies the separator the original concatenation
        # omitted; a directory that already ends in "/" is unaffected.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' of ``url``, ignoring any query/fragment."""
        path = url.split("?")[0].split("#")[0]
        return path.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码 . W) p& c2 l5 r8 E# o- l
6 Q5 Q- [( {7 `7 S$ }9 Z |