python3,python实践-轩宇阅读网爬取单册小说

 2023-09-23 阅读 38 评论 0

摘要:import requests import re,os from bs4 import BeautifulSoup#红楼梦 # URL = 'https://www.xyyuedu.com/wgmz/dongyeguiwu/baiyexingxs/' # DIR = r'D:\test\0602\baiyexing'#白夜行 # URL = 'https://www.xyyuedu.com/gdmz/sidamingzhu/hl
import requests
import re,os
from bs4 import BeautifulSoup#红楼梦
# 白夜行 (Journey Under the Midnight Sun)
# URL = 'https://www.xyyuedu.com/wgmz/dongyeguiwu/baiyexingxs/'
# DIR = r'D:\test\0602\baiyexing'
# 红楼梦 (Dream of the Red Chamber)
# URL = 'https://www.xyyuedu.com/gdmz/sidamingzhu/hlmeng/index.html'
# DIR = r'D:\test\0602\hongloumeng'
# 过去我死去的家 (The House Where I Once Died)
# URL = 'https://www.xyyuedu.com/wgmz/dongyeguiwu/guoquwosiqudejia/index.html'
# DIR = r'D:\test\0602\gqwsqdj'
# 英烈传 (currently active configuration)
URL = 'https://www.xyyuedu.com/gdmz/yingliechuan/index.html'
DIR = r'D:\test\mingzhu\中国古典文学\英烈传'

# Seconds to wait on a request before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Scheme and host prepended to the site-relative chapter links.
SITE_ROOT = 'https://www.xyyuedu.com'

# Paragraphs containing any of these markers are skipped by writeDown().
_BOILERPLATE_MARKERS = (
    '微信扫码关注',
    '互联网信息管理办法',
    '声明',
    '分页',
    '开始',
    '回目录',
    '轩宇阅读网',
)


def getHomeHtml(url, isGetStatusCode=False):
    """Request a page and return its HTML text, or only its status code.

    :param url: absolute URL of the page to request.
    :param isGetStatusCode: when true, return the HTTP status code instead
        of the page text.
    :return: the integer status code, or the response body decoded as GBK
        with undecodable bytes dropped.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if isGetStatusCode:
        return response.status_code
    return response.content.decode('gbk', 'ignore')


def _safe_file_name(title):
    """Remove characters that cannot appear in a Windows file name."""
    return re.sub(r'[\\/:*?"<>|]', '', title)


def getChapterUrlList(homeHtml):
    """Build a mapping of chapter name to the list of that chapter's page URLs.

    Chapter links are located with a three-directory-level pattern first and
    a two-level pattern as a fallback. For each chapter, follow-up pages
    ``<chapter>_2.html`` .. ``<chapter>_9.html`` are probed and kept until
    the first one that does not answer with status 200.

    :param homeHtml: HTML text of the book's index page.
    :return: dict whose keys are ``"<1-based index><title>"`` and whose
        values are lists of absolute page URLs.
    """
    link_patterns = (
        r'<a href="(/[A-Za-z]+/[A-Za-z]+/[A-Za-z]+/[0-9]+\.html)"',
        r'<a href="(/[A-Za-z]+/[A-Za-z]+/[0-9]+\.html)"',
    )
    relative_urls = []
    for pattern in link_patterns:
        relative_urls = re.findall(pattern, homeHtml)
        if relative_urls:
            break

    pages_per_chapter = []
    for relative_url in relative_urls:
        first_page = SITE_ROOT + relative_url
        pages = [first_page]
        # Strip only the trailing extension so the page suffix can be added.
        stem = first_page[:-len('.html')]
        for page_no in range(2, 10):
            next_page = '%s_%s.html' % (stem, page_no)
            statusCode = getHomeHtml(next_page, isGetStatusCode=True)
            print('%s   %s' % (next_page, statusCode))
            if statusCode != 200:
                break
            pages.append(next_page)
        pages_per_chapter.append(pages)

    titles = re.findall(r'title="(.+?)"   target="_blank"', homeHtml)
    if len(titles) != len(pages_per_chapter):
        # Indexing one list by the other's length used to raise IndexError;
        # pair what can be paired and report the mismatch instead.
        print('警告:章节标题数(%d)与链接数(%d)不一致,按较少者配对'
              % (len(titles), len(pages_per_chapter)))

    dict_chapter = {}
    for index, (title, pages) in enumerate(zip(titles, pages_per_chapter), start=1):
        dict_chapter[str(index) + _safe_file_name(title)] = pages
    return dict_chapter


def saveChapterText(name, urlList):
    """Download every page of one chapter and append its text to a file.

    Each page is first parsed paragraph by paragraph (``writeDown``); only
    when that yields nothing is the whole-``<div>`` mode
    (``writeDown_mode2``) tried.

    :param name: chapter name, used as the output file's base name.
    :param urlList: absolute URLs of the chapter's pages, in reading order.
    :return: None
    """
    for url in urlList:
        soup = BeautifulSoup(getHomeHtml(url), 'html.parser')
        if writeDown(name, soup.find_all('p')):
            print('下载%s成功' % (name))
            continue
        print('未获得文本,切换模式下载')
        if writeDown_mode2(name, soup.find_all('div', id="onearcxsbd")):
            print('下载%s成功' % (name))
        else:
            print('失败,未获得文本')


def _strip_markup(fragment):
    """Return *fragment* as text without paragraph tags, ``<br/>`` or spaces.

    Everything from the first ``<!--分页-->`` marker onward is discarded.
    """
    text = str(fragment)
    for token in ('<p>', '</p>', '<br/>', '&lt', ' '):
        text = text.replace(token, '')
    return text.split('<!--分页-->')[0]


def writeDown_mode2(name, chapterTextList):
    """Second parsing mode: the chapter text lives inside one ``<div>``.

    :param name: chapter name, used as the output file's base name.
    :param chapterTextList: the ``<div id="onearcxsbd">`` elements of a page.
    :return: True when at least one non-empty text was written.
    """
    isMatch = False
    for element in chapterTextList:
        # Spaces were already removed, so the opening tag appears fused.
        text = _strip_markup(element).replace(
            '<divclass="onearcxsbd"id="onearcxsbd">', '')
        if text:
            writeInText(name, text + '\r\n')
            isMatch = True
    return isMatch


def writeDown(name, chapterTextList):
    """First parsing mode: the chapter text lives in individual ``<p>`` tags.

    Paragraphs that are empty or contain one of the boilerplate markers are
    skipped.

    :param name: chapter name, used as the output file's base name.
    :param chapterTextList: the ``<p>`` elements of a page.
    :return: True when at least one paragraph was written.
    """
    isMatch = False
    for element in chapterTextList:
        text = _strip_markup(element)
        if text and not any(marker in text for marker in _BOILERPLATE_MARKERS):
            writeInText(name, text + '\r\n')
            isMatch = True
    return isMatch


def writeInText(name, text):
    """Append *text* to ``<DIR>/<name>.txt`` encoded as UTF-8.

    :param name: base name of the output file (without extension).
    :param text: text to append.
    :return: None
    """
    fileName = os.path.join(DIR, '%s.txt' % name)
    print('写入文件%s' % (fileName))
    # newline='' writes the explicit '\r\n' endings verbatim instead of
    # letting text mode expand the '\n' a second time on Windows.
    with open(fileName, 'a', encoding='utf-8', newline='') as fb:
        fb.write(text)


def reptileNovel(url, dir):
    """Public entry point: download a whole book into a directory.

    Updates the module-level ``URL`` and ``DIR``, creates the directory when
    it is missing, then saves every chapter found on the index page.

    :param url: URL of the book's index page.
    :param dir: directory that receives one ``.txt`` file per chapter.
    :return: None
    """
    global URL
    global DIR
    DIR = dir
    URL = url
    os.makedirs(dir, exist_ok=True)
    homeHtml = getHomeHtml(url)
    for chapterName, pages in getChapterUrlList(homeHtml).items():
        saveChapterText(chapterName, pages)


if __name__ == '__main__':
    reptileNovel(URL, DIR)

执行结果如下图:

版权声明:本站所有资料均为网友推荐收集整理而来,仅供学习和研究交流使用。

原文链接:https://808629.com/94977.html

发表评论:

本站为非营利网站,部分文章来源或改编自互联网及其他公众平台,主要目的在于分享信息,版权归原作者所有,内容仅供读者参考,如有侵权请联系我们删除!

Copyright © 2022 86后生记录生活 Inc. 保留所有权利。

底部版权信息