The requirement: fetch a Tieba forum's subject posts from the last 20 days together with their direct replies (replies to replies are filtered out), and write the data to MySQL. Baidu Tieba's Shanghai bar (上海吧) is used as the example here.
The Shanghai bar is structured as follows: both the subject-post list and the replies inside each thread are paginated.
Define the global settings (settings.py):
# -*- coding: utf-8 -*-

# Scrapy settings for tieba project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tieba'

SPIDER_MODULES = ['tieba.spiders']
NEWSPIDER_MODULE = 'tieba.spiders'

START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7'
#START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6'

TOTAL_DAYS = "20"

ITEM_PIPELINES = ['tieba.pipelines.MySQLDBPipeline']

MySQL_SERVER = "localhost"
MySQL_SERVER_PORT = 3306
MySQL_SERVER_DB = "tieba"
MySQL_SERVER_USER = "mysql"
MySQL_SERVER_PWD = "xyz"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0'
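A side note: the list form of ITEM_PIPELINES shown above is accepted by the older Scrapy releases this code targets. If you are on Scrapy 0.25 or later, the setting is expected to be a dict mapping the pipeline path to an order value, roughly like this:

# Equivalent pipeline setting for newer Scrapy versions (0.25 and later),
# where each pipeline class path maps to an order value between 0 and 1000.
ITEM_PIPELINES = {
    'tieba.pipelines.MySQLDBPipeline': 300,
}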
Data scraping part (TiebaSpider.py; only subject posts are done, reply parsing is not ready yet):
#coding=utf-8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from tieba.items import SubjectItem
from tieba.items import CommentItem
from tieba import settings
import scrapy
import json


class TiebaSpider(CrawlSpider):
    name = 'tieba'
    # Note: promoted (ad) threads appear to live on a different domain, so
    # restricting allowed_domains already filters them out of the subject posts.
    allowed_domains = ['tieba.baidu.com']

    start_urls = [settings.START_URL]
    # Assume there are fewer than 1000 * 50 subject posts within the 20-day window;
    # adjust as needed, or compute the exact number of pages from the post dates
    # shown on the list pages.
    for x in range(0, 1000):
        start_urls.append(settings.START_URL + "&pn=" + str((x + 1) * 50))

    # Only subject-post (thread) pages are parsed by this rule.
    rules = [Rule(LinkExtractor(allow=['/p/\d+']), 'parse_subject_shanghai')]

    def parse_subject_shanghai(self, response):
        try:
            torrent = SubjectItem()
            torrent['url'] = response.url
            torrent['id'] = response.url.split('/p')[1].split('/')[1].split('?')[0]
            torrent['commentNum'] = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[1]/text()").extract()[0]
            # Locating the content by id did not work; one likely reason is that
            # the post body is wrapped in a custom <cc> tag.
            torrent['content'] = response.xpath("//*/cc/div/text()").extract()[0]
            # A lot of information is not present in the HTML source; it is rendered
            # client-side with JS, but the raw values live in the data-field attribute.
            dataField = json.loads(str(response.xpath("//*[@id='j_p_postlist']/div[1]/@data-field").extract()[0]))
            torrent['created'] = dataField['content']['date'].strip() + ":00"
            torrent['title'] = response.xpath("//*[@id='j_core_title_wrap']/div/h1/text()").extract()[0]
            torrent['tiebaName'] = response.xpath("//*[@id='container']/div/div[1]/div[2]/div[2]/a/text()").extract()[0].strip()
            torrent['authorName'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/text()").extract()[0]
            torrent['authorUrl'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/@href").extract()[0]
            torrent['authorAvatar'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[1]/div/a/img/@src").extract()[0]
            if not "http://tieba.baidu.com" in torrent['authorUrl']:
                torrent['authorUrl'] = "http://tieba.baidu.com" + torrent['authorUrl']

            # Grab up to three images from the subject post body.
            hxs = HtmlXPathSelector(response)
            subject_post_div = hxs.select("//*/cc/div")[0]
            imgs = ['', '', '']
            index = 1
            for img in subject_post_div.select(".//img/@src"):
                if index > 3:
                    break
                imgs[index - 1] = img.extract()
                index += 1
            torrent['image1'], torrent['image2'], torrent['image3'] = imgs
            # The subject post itself is fully parsed at this point.

            # Schedule the remaining reply pages of this thread.
            totalCommentPage = int(response.xpath("//div[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()[0])
            for x in range(2, totalCommentPage):
                url = torrent['url'] + ("?pn=%s" % x)
                yield scrapy.Request(url=url, callback=self.parse_comments_shanghai)
        except:
            torrent['id'] = None
        yield torrent

    def parse_comments_shanghai(self, response):
        try:
            items = []
            print response
            hxs = HtmlXPathSelector(response)
            print "---------------------------------------------------"
            j_p_postlist = hxs.select("//div[@id='j_p_postlist']").select(".//div[@class='l_post l_post_bright ']")
            print "----------------------------------------got it", j_p_postlist
            for childNode in j_p_postlist:
                print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                print childNode.extract()
                #for content in j_p_postlist.select(".//div[@id='l_post l_post_bright']/text()"):
                #    print '=-===content', content
        except:
            for item in items:
                item['id'] = None
        return items
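As written, parse_comments_shanghai only prints the raw reply nodes for inspection. Below is a minimal sketch of how it could be completed to emit CommentItem objects; it drops into TiebaSpider and reuses the json and CommentItem imports already there. The XPath expressions and the layout of each reply's data-field JSON (author/user_name, content/post_no, content/date) are assumptions modeled on the subject-post parsing above, so they need to be verified against the live page.

    def parse_comments_shanghai(self, response):
        # Assumption: every direct reply is a div under #j_p_postlist carrying a
        # JSON data-field attribute, just like the subject post's first div.
        article_id = response.url.split('/p/')[1].split('?')[0]
        for post in response.xpath("//div[@id='j_p_postlist']/div[contains(@class, 'l_post')]"):
            try:
                data_field = json.loads(post.xpath("./@data-field").extract()[0])
            except (IndexError, ValueError):
                continue  # node without a usable data-field, skip it
            item = CommentItem()
            item['article_id'] = article_id
            item['authorName'] = data_field.get('author', {}).get('user_name', '')
            item['index'] = data_field.get('content', {}).get('post_no', '')
            item['created'] = data_field.get('content', {}).get('date', '')
            # Join all text nodes of the reply body; the class name is an assumption.
            item['content'] = "".join(
                post.xpath(".//div[contains(@class, 'd_post_content')]//text()").extract()).strip()
            yield item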
Data storage part (pipelines.py, subject posts only):
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import settings
from scrapy import log
import traceback
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from datetime import datetime


def strtodatetime(datestr, format):
    return datetime.strptime(datestr, format)


class MySQLDBPipeline(object):
    def __init__(self):
        self.date_time_format = "%Y-%m-%d %H:%M:%S"
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            host=settings.MySQL_SERVER,
                                            db=settings.MySQL_SERVER_DB,
                                            port=settings.MySQL_SERVER_PORT,
                                            user=settings.MySQL_SERVER_USER,
                                            passwd=settings.MySQL_SERVER_PWD,
                                            cp_reconnect=True,
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=True)

    def process_item(self, item, spider):
        # Run the db query in a thread pool so the reactor is not blocked.
        self.dbpool.runInteraction(self._conditional_insert, item).addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        if item.get('id') and item.get('created'):
            today = datetime.now()
            postDay = strtodatetime(item.get('created'), self.date_time_format)
            # Only store posts created within the last TOTAL_DAYS days.
            if (today - postDay).days <= int(settings.TOTAL_DAYS):
                args = (item['id'], item['title'], item['url'], item['tiebaName'],
                        item['authorName'], item['authorUrl'], item['authorAvatar'],
                        item['content'], item['created'],
                        item['image1'], item['image2'], item['image3'],
                        item['commentNum'], item['commentNum'])
                # Use a parameterized query so quotes in the post content cannot
                # break the statement (string-formatted SQL would).
                sql = '''insert into tieba_articles(id, title, url, tiebaName, authorName,
                         authorUrl, authorAvatar, content, created, image1, image2, image3, commentNum)
                         VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                         ON DUPLICATE KEY UPDATE commentNum = %s'''
                tx.execute(sql, args)

    def handle_error(self, e):
        log.err(e)
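The pipeline assumes a tieba_articles table already exists in the tieba database. One possible way to create it is sketched below; the column names come from the INSERT statement above, while the types and sizes are assumptions you should adjust to your data.

# One-off helper to create the tieba_articles table the pipeline writes to.
# Column names come from the INSERT above; the types and sizes are assumptions.
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS tieba_articles (
    id           VARCHAR(32)  NOT NULL PRIMARY KEY,
    title        VARCHAR(255),
    url          VARCHAR(255),
    tiebaName    VARCHAR(64),
    authorName   VARCHAR(64),
    authorUrl    VARCHAR(255),
    authorAvatar VARCHAR(255),
    content      TEXT,
    created      DATETIME,
    image1       VARCHAR(255),
    image2       VARCHAR(255),
    image3       VARCHAR(255),
    commentNum   INT
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect(host="localhost", port=3306, user="mysql", passwd="xyz",
                       db="tieba", charset="utf8")
try:
    conn.cursor().execute(DDL)
    conn.commit()
finally:
    conn.close()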
Almost forgot, here is the data structure part (items.py):
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SubjectItem(scrapy.Item):
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    tiebaName = scrapy.Field()
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()
    commentNum = scrapy.Field()
    created = scrapy.Field()
    content = scrapy.Field()
    image1 = scrapy.Field()
    image2 = scrapy.Field()
    image3 = scrapy.Field()


class CommentItem(scrapy.Item):
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()
    content = scrapy.Field()
    index = scrapy.Field()
    article_id = scrapy.Field()
    created = scrapy.Field()
Summary: Scrapy defines a clean class hierarchy, so developers only need to focus on the business logic itself. For paginated data there are two patterns: 1) add all known URLs to a list (start_urls) up front, as the spider above does; 2) yield scrapy.Request(...) from a callback, as sketched below.
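As an illustration of the second pattern, here is a minimal, self-contained sketch of a spider that follows a "next page" link lazily instead of pre-building the URL list. The example site and the next-page XPath are hypothetical placeholders, not taken from the Tieba markup.

# Pattern 2: follow pagination lazily by yielding scrapy.Request from the callback.
# The site and the next-page XPath are hypothetical placeholders.
import urlparse  # Python 2, matching the code above

import scrapy


class PagedSpider(scrapy.Spider):
    name = 'paged_example'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        # Schedule the detail pages found on the current list page.
        for url in response.xpath("//a[contains(@href, '/p/')]/@href").extract():
            yield scrapy.Request(urlparse.urljoin(response.url, url), callback=self.parse_detail)
        # Follow the "next page" link, if any, with the same callback.
        next_page = response.xpath("//a[@class='next']/@href").extract()
        if next_page:
            yield scrapy.Request(urlparse.urljoin(response.url, next_page[0]), callback=self.parse)

    def parse_detail(self, response):
        pass  # parse the detail page here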
For parsing, you can use Scrapy's built-in XPath selectors, or a third-party module such as BeautifulSoup.
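For example, here is a minimal sketch of extracting the thread title with BeautifulSoup instead of XPath; it requires the beautifulsoup4 package and reuses the j_core_title_wrap id from the spider above.

# Minimal sketch: parse the thread title with BeautifulSoup instead of XPath.
# Uses the same j_core_title_wrap id the spider's XPath relies on.
from bs4 import BeautifulSoup


def parse_title_with_bs(response):
    soup = BeautifulSoup(response.body, "html.parser")
    wrap = soup.find(id="j_core_title_wrap")
    h1 = wrap.find("h1") if wrap else None
    return h1.get_text().strip() if h1 else None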