I. Creating the spider:
cd into the folder where we keep our projects, then run:

scrapy startproject BQG
cd BQG
scrapy genspider biquge biduo.cc
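For reference, the freshly generated project should look roughly like this (spiders/biquge.py is created by the genspider command; items.py and middlewares.py are generated too but are not modified in this post):

BQG/
    scrapy.cfg
    BQG/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            biquge.py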
II. Open the project directory with PyCharm
1. The settings file
Here I simply paste my entire settings.py:
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent

# Scrapy settings for BQG project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "BQG"

SPIDER_MODULES = ["BQG.spiders"]
NEWSPIDER_MODULE = "BQG.spiders"

LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent().chrome

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "BQG.middlewares.BqgSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "BQG.middlewares.BqgDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "BQG.pipelines.BqgPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
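One thing to note: USER_AGENT = UserAgent().chrome is evaluated only once, when the settings module is loaded, so every request in a run carries the same Chrome user agent. If you want a fresh User-Agent per request, the usual approach is a small downloader middleware. The sketch below is an optional addition of mine, not part of the original project; it would live in BQG/middlewares.py and be enabled through DOWNLOADER_MIDDLEWARES:

# BQG/middlewares.py -- optional per-request User-Agent rotation (sketch)
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):

    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Each attribute access on UserAgent returns a freshly picked UA string.
        request.headers["User-Agent"] = self.ua.chrome
        return None  # let Scrapy continue handling the request

To enable it, extend DOWNLOADER_MIDDLEWARES in settings.py:

DOWNLOADER_MIDDLEWARES = {
    "BQG.middlewares.RandomUserAgentMiddleware": 400,
}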
2. The spider file
A quick analysis of the spider:
(1) The parse function handles the links found on the home page. Two kinds of target URLs are extracted here: category page links (e.g. https://www.biduo.cc/book_1_1/) and novel index page links (e.g. https://www.biduo.cc/biquge/56_56606/). Category page links are fed back into parse, so both kinds of target links keep being collected from them; novel index links are handed to the next callback, get_novel.
(2) get_novel only extracts the novel title and the link to its first chapter (e.g. https://www.biduo.cc/biquge/56_56606/c100952.html). (We deliberately do not collect all chapter links here: Scrapy issues requests concurrently, so if every chapter were scheduled at once the responses would come back out of order.) Starting from that first-chapter link, get_page_content then walks through the whole novel.
(3) get_page_content extracts each chapter's title and content. After the first chapter it follows the "next chapter" link to request the following page, which avoids the ordering problem of concurrent responses.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import re


class BiqugeSpider(scrapy.Spider):
    name = "biquge"
    allowed_domains = ["biduo.cc"]
    start_urls = ["https://www.biduo.cc/"]

    def parse(self, response):
        # print(response.text)
        # pat = r'href="/book_\d+_\d+/">'   # first attempt, superseded by the simpler pattern below
        pat = r"/book_\d+_\d+/"             # category page links
        tab_lists = re.findall(pat, response.text)
        print("****************************")
        for li in tab_lists:
            yield scrapy.Request(
                url="https://www.biduo.cc" + li,
                callback=self.parse,
            )
        pat1 = r"/biquge/\d+_\d+/"          # novel index page links
        t_lists = re.findall(pat1, response.text)
        for li in t_lists:
            # print(li)
            yield scrapy.Request(
                url="https://www.biduo.cc" + li,
                callback=self.get_novel,
            )

    def get_novel(self, response):
        novel_url = response.url
        novel_title = response.xpath('//div[@id="info"]/h1/text()').extract_first()
        # novel_lists = response.xpath('//div[@id="list"]/dl/dd/a/@href').extract()
        novel_first = "https://www.biduo.cc" + response.xpath(
            '//div[@id="list"]/dl/dd[1]/a/@href').extract_first()
        yield scrapy.Request(
            url=novel_first,
            callback=self.get_page_content,
            meta={"novel_title": novel_title, "novel_url": novel_url},
        )

    def get_page_content(self, response):
        item = {}
        item["novel_title"] = response.meta["novel_title"]
        item["novel_url"] = response.meta["novel_url"]
        item["page_title"] = response.xpath("//h1/text()").extract_first()
        item["page_url"] = response.url
        item["page_content"] = "".join(
            response.xpath('//div[@id="content"]/text()').extract()).replace("\xa0", "")
        # item["page_content"] = response.xpath('//div[@id="content"]/text()').extract()
        yield item

        # "next chapter" link; full xpath: //*[@id="wrapper"]/div[4]/div/div[2]/div[1]/a[3]
        next1 = response.xpath('//div[@class="bottem2"]/a[3]/@href').extract_first()
        print(next1)
        next_url = "https://www.biduo.cc" + next1  # or: next_url = response.urljoin(next1)
        # On the last chapter the "next" link points back to the novel index, so stop there.
        if next_url != item["novel_url"]:
            yield scrapy.Request(
                url=next_url,
                callback=self.get_page_content,
                meta={"novel_title": item["novel_title"], "novel_url": item["novel_url"]},
            )
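To start the crawl, run scrapy crawl biquge from the directory that contains scrapy.cfg. If you prefer launching it from inside PyCharm, a small runner script does the same job; the file name run.py and its location next to scrapy.cfg are my own convention, not part of the original project:

# run.py -- convenience launcher placed next to scrapy.cfg
from scrapy.cmdline import execute

if __name__ == "__main__":
    # Equivalent to typing "scrapy crawl biquge" on the command line.
    execute(["scrapy", "crawl", "biquge"])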
3. The pipeline file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BqgPipeline(object):

    def process_item(self, item, spider):
        # One text file per novel; the novels/ directory must already exist.
        # Append mode, so chapters accumulate as they arrive.
        self.file = open("novels/{}.txt".format(item["novel_title"]),
                         "a+", encoding="utf-8")
        print(item["page_title"])
        self.file.write(item["page_title"] + "\n")
        self.file.write(item["page_url"] + "\n")
        self.file.write(item["page_content"] + "\n")
        self.file.close()
        return item
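Because process_item reopens the novel's text file for every single chapter, a long crawl spends a fair amount of time on file I/O. A possible variant (my own sketch, not the original code) caches one open handle per novel and closes them all when the spider finishes; to use it you would point ITEM_PIPELINES at this class instead:

# pipelines.py -- alternative pipeline keeping one open handle per novel (sketch)
import os


class CachedFileBqgPipeline(object):

    def open_spider(self, spider):
        # Create the output directory up front and prepare the handle cache.
        os.makedirs("novels", exist_ok=True)
        self.files = {}  # novel_title -> open file handle

    def process_item(self, item, spider):
        f = self.files.get(item["novel_title"])
        if f is None:
            f = open("novels/{}.txt".format(item["novel_title"]),
                     "a+", encoding="utf-8")
            self.files[item["novel_title"]] = f
        f.write(item["page_title"] + "\n")
        f.write(item["page_url"] + "\n")
        f.write(item["page_content"] + "\n")
        return item

    def close_spider(self, spider):
        # Close every per-novel file once the crawl is done.
        for f in self.files.values():
            f.close()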
4. Running results:
5. The full source code has been uploaded to my resources; download it there if you need it.