Python + pyspider crawler for a novel site: stores results in a database, publishes through the LocoySpider (火车头) interface, and downloads resources locally. You can also adapt it to write your own spiders!

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
from pyspider.libs.base_handler import *
import pymysql
import datetime
import os
import sys
import re
import requests
class Handler(BaseHandler):
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where scraped cover images are saved
    global Datos
    Datos = {}
headers= {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
}
crawl_config = {
'headers' : headers,
'timeout' : 300
}
    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        db = pymysql.connect(host="localhost", user="DB_USER", password="DB_PASSWORD", db="DB_NAME", charset="utf8")
        try:
            cursor = db.cursor()
            # parameterized placeholders let the driver escape quotes in the data
            sql = 'INSERT INTO BookFile (Bookname, BookID, img, Locaimg, Book_Date) VALUES (%s, %s, %s, %s, %s)'
            cursor.execute(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()
    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        db = pymysql.connect(host="localhost", user="DB_USER", password="DB_PASSWORD", db="DB_NAME", charset="utf8")
        try:
            cursor = db.cursor()
            # parameterized placeholders let the driver escape quotes in the data
            sql = 'INSERT INTO BookTitle (Bookname, Booktitle, BookID, Titleid, Book_Date) VALUES (%s, %s, %s, %s, %s)'
            cursor.execute(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()
    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        db = pymysql.connect(host="localhost", user="DB_USER", password="DB_PASSWORD", db="DB_NAME", charset="utf8")
        try:
            cursor = db.cursor()
            sql = ('INSERT INTO BookConte (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, '
                   'Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date) '
                   'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            print(sql)
            cursor.execute(sql, (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date))
            print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        reload(sys)
        sys.setdefaultencoding("gbk")  # Python 2 idiom: the publish endpoint expects GBK-encoded form data
        locoy_url = 'http://www.******.net/locoy/?my=book'  # LocoySpider (火车头) publish endpoint
        locoy_data = {
            'my_u': 'USERNAME',  # CMS backend username
            'my_p': 'PASSWORD',  # CMS backend password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res
def __init__(self):
self.base_url1 = 'https://www.****.cc/'
self.base_url2 = '/'
self.CaterId = []
self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
self.page_num = 1
self.total_num = 200
    @every(minutes=8 * 60)
    def on_start(self):
        # category slug -> CMS category name
        cater_names = {
            'xuanhuan': '玄幻',
            'wuxia': '武侠',
            'lishi': '历史',
            'yanqing': '都市',
            'nvsheng': '都市',
            'kehuan': '科幻',
            'kongbu': '游戏',
        }
        while self.page_num <= self.total_num:
            for self.CaterId in self.CaterIds:
                Cater_Name = cater_names[self.CaterId]
                print(self.CaterId)
                url = self.base_url1 + str(self.CaterId) + self.base_url2 + str(self.page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)
            self.page_num += 1
def list_Caterg(self, response):
Cater_Name = response.save
for each in response.doc('.pic-list a').items():
self.crawl(each.attr.href, callback=self.list_Caterg_detail,save=Cater_Name)
    def list_Caterg_detail(self, response):
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # word count: split() returns a list, so index into it before splitting again
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[-1].split('|')[-1]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # novel ID (first path segment)
        Book_Dates = str(datetime.datetime.now())
        for imgs in response.doc('.bigpic > img').items():
            img = imgs.attr.src
            print(img)
            # download the novel cover
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):  # optional: save the cover image locally
                print('attachment url is ' + img)
            Datos = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)  # optional: DB hook so other systems can publish
        for each in response.doc('div a').items():  # follow links to the chapter-list page
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)
@config(age=8 * 60 * 60)
def index_page(self, response):
Datos = {
"Cater_Name":response.save['Cater_Name'],
"Book_author":response.save['Book_author'],
"Book_Introduction":response.save['Book_Introduction'],
"Book_Synopsis":response.save['Book_Synopsis'],
"Book_Palabras":response.save['Book_Palabras'],
"img":response.save['img'],
}
        # alternative: crawl every chapter via response.doc('.chapter-list a')
        for each in response.doc('.chapter-list li:first-child a').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
@config(priority=2)
@catch_status_code_error
def detail_page(self, response):
        # source-site watermark strings to strip or replace
        NewRe1 = u'哈书'
        NewRe2 = u'huhjsd.CC'
        NewRe3 = r'^\n\n'  # regex for a leading blank line, applied with re.sub below
        NewRe5 = u'小说网'
        NewRe6 = u'fgdfgf'
        NewRe7 = u'fgfgf'
        NewRe8 = u'ffhgf'
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
        # replacement strings
        ReC1 = u'静思'
        ReC2 = u'aghgf.com'
        ReC3 = u'aghgfh.com'
        ReC4 = u''
        ReC5 = u'文学网'
        ReC6 = r'<BR>'
        Bookname = response.doc('.readlocation a').eq(2).text()  # novel title
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Introduction1 = response.save['Book_Introduction']  # introduction
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # novel URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # novel ID
        BookConte1 = response.doc('.article-con').text()  # chapter content
        abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']  # status field (serialized or complete)
        Book_Date = str(datetime.datetime.now())  # crawl timestamp
        BookConte2 = BookConte1.replace(NewRe1, ReC1)
        BookConte3 = BookConte2.replace(NewRe2, ReC2)
        BookConte5 = BookConte3.replace(NewRe5, ReC5)
        BookConte6 = BookConte5.replace(NewRe6, ReC2)
        BookConte7 = BookConte6.replace(NewRe7, ReC2)
        BookConte8 = re.sub(NewRe3, ReC6, BookConte7)  # NewRe3 is a regex, so use re.sub rather than str.replace
        BookConte4 = re.sub(NewRe4, ReC4, BookConte8)
        BookConte = BookConte4.replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction2 = Book_Introduction1.replace(NewRe1, ReC1)
        Book_Introduction3 = Book_Introduction2.replace(NewRe2, ReC2)
        Book_Introduction4 = re.sub(NewRe3, ReC3, Book_Introduction3)
        Book_Introduction = re.sub(NewRe4, ReC4, Book_Introduction4)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]  # chapter ID (first path segment after the book ID)
        Book_img = response.save['img']  # cover image URL (no trailing comma, which would create a tuple)
        # insert the novel into MySQL
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)  # optional: DB publish hook for other systems
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)  # optional: DB publish hook for other systems
        # publish via POST
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)  # optional: LocoySpider publish hook; remove if not needed
Datos = {
"Cater_Name":response.save['Cater_Name'],
"Book_author":response.save['Book_author'],
"Book_Introduction":response.save['Book_Introduction'],
"Book_Synopsis":response.save['Book_Synopsis'],
"Book_Palabras":response.save['Book_Palabras'],
"img":response.save['img'],
}
for each in response.doc('.articlebtn > a:nth-child(4)').items():
self.crawl(each.attr.href, callback=self.detail_page,save=Datos)
return {
"Cater_Name":Cater_Name,
"Bookname":Bookname,
"Book_author":Book_author,
"Book_Introduction":Book_Introduction,
"Book_Synopsis":Book_Synopsis,
"Book_Palabras":Book_Palabras,
"Book_img":Book_img,
"Bookurl": response.url,
"Booktitle": Booktitle,
"BookID": BookID,
"BookConte": BookConte,
"Titleid": Titleid,
"abover":abover,
# "Book_Date" = str(datetime.datetime.now()),
}
    def download(self, P_dir, imgDir, file_name, Book_img):
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        file_path = imgDir + "/" + file_name
        imag = requests.get(Book_img)
        with open(file_path, 'wb+') as f:
            f.write(imag.content)
        return True  # signal success so the caller's if-check fires
    # callback: save an image response fetched via self.crawl
def save_imgs(self,response):
content = response.content
file_name = response.save["file_name"]
imgDir = response.save["imgDir"]
file_path = imgDir + file_name
self.save_img(content,imgDir,file_path)
    # write image bytes to disk
    def save_img(self, content, imgDir, path):
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)
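    # Note: save_imgs/save_img are not wired into the flow above; download()
    # fetches covers synchronously with requests instead. A hypothetical,
    # non-blocking alternative through pyspider's own scheduler would be:
    #   self.crawl(img, callback=self.save_imgs,
    #              save={"file_name": file_name, "imgDir": imgDir + "/"})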
    # get the file extension from a URL
    def getExtension(self, url):
        extension = url.split(".")[-1]
        return extension
    # get the image file name (without extension) from a URL
    def getname(self, url):
        name = url.split("/")[-1].split(".")[0]  # split() returns a list, so take the base-name element
        return name
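The INSERT statements above assume three tables that the post never defines. Here is a minimal sketch of a schema inferred purely from the column lists in the code; all types and lengths are guesses, so adjust them to your data:

# create_tables.py -- hypothetical schema matching the INSERTs above
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS BookFile (
        id INT AUTO_INCREMENT PRIMARY KEY,
        Bookname VARCHAR(255), BookID VARCHAR(64),
        img VARCHAR(512), Locaimg VARCHAR(512), Book_Date VARCHAR(32)
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS BookTitle (
        id INT AUTO_INCREMENT PRIMARY KEY,
        Bookname VARCHAR(255), Booktitle VARCHAR(255), BookID VARCHAR(64),
        Titleid VARCHAR(64), Book_Date VARCHAR(32)
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS BookConte (
        id INT AUTO_INCREMENT PRIMARY KEY,
        Bookname VARCHAR(255), Cater_Name VARCHAR(64), Book_author VARCHAR(128),
        Book_Introduction TEXT, Book_Synopsis TEXT, Book_Palabras VARCHAR(64),
        Bookurl VARCHAR(512), Booktitle VARCHAR(255), BookID VARCHAR(64),
        BookConte MEDIUMTEXT, Titleid VARCHAR(64), abover TEXT, Book_Date VARCHAR(32)
    ) DEFAULT CHARSET=utf8""",
]

db = pymysql.connect(host="localhost", user="DB_USER", password="DB_PASSWORD",
                     db="DB_NAME", charset="utf8")
try:
    with db.cursor() as cursor:
        for stmt in DDL:
            cursor.execute(stmt)
    db.commit()
finally:
    db.close()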
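To check the LocoySpider publish interface without running the whole spider, the same POST that add_locoy sends can be issued standalone. A minimal sketch, reusing the form fields from add_locoy; the endpoint URL, the credentials, and all sample values are placeholders:

# locoy_test.py -- standalone check of the publish interface
# -*- coding: utf-8 -*-
import requests

locoy_url = 'http://www.******.net/locoy/?my=book'  # endpoint elided as in the post
payload = {
    'my_u': 'USERNAME',
    'my_p': 'PASSWORD',
    'subject_669977_net': u'测试书名'.encode('gbk', 'ignore'),
    'caid': u'玄幻'.encode('gbk', 'ignore'),
    'title_669977_net': u'第一章'.encode('gbk', 'ignore'),
    'article': u'正文内容'.encode('gbk', 'ignore'),
    'author': u'某作者'.encode('gbk', 'ignore'),
    'ready_1': u'10000'.encode('gbk', 'ignore'),
    'thumb': 'http://example.com/cover.jpg',
    'content': u'简介'.encode('gbk', 'ignore'),
    'abover': u'连载'.encode('gbk', 'ignore'),
}
res = requests.post(locoy_url, data=payload)
print(res.status_code)
print(res.content)  # the endpoint replies in GBK; decode before printing if needed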