# Created April 2, 2015 01:46
# Scrapy scraper for BuzzFeed
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from buzzLinks.items import BuzzlinksItem
from urlparse import urlparse
# Spider
class DmozSpider(CrawlSpider):
    """Crawl spider that records the links found inside each article body.

    Every page matched by ``rules`` is passed to ``parse_item``, which emits
    one ``BuzzlinksItem`` per qualifying ``<a href>`` on that page.
    """

    name = "buzzfeedNews"
    # NOTE(review): the domain and seed-URL literals are blank/missing in this
    # copy of the file (they look like they were stripped). Fill them in
    # before running; with these values the spider has nothing to crawl.
    allowed_domains = [""]
    start_urls = []
    page_count = 0

    # Determines what pages the spider should crawl
    rules = (
        Rule(LxmlLinkExtractor(allow_domains=('')), callback='parse_item'),
    )

    # hrefs of anchors inside the article body, skipping share boxes and
    # rel="nofollow" anchors, and dropping hrefs that mention the site itself.
    # The predicate tests "." (the attribute's own value): an attribute node
    # has no text() children, so contains(text(), ...) could never match and
    # the filter would silently let every href through.
    _LINK_XPATH = (
        '//div[@id="buzz_sub_buzz"]'
        '//div[not(contains(@class,"share-box"))]'
        '//a[not(@rel="nofollow")]'
        '/@href[not(contains(.,"buzzfeed") or contains(.,"buzzfed"))]'
    )

    # Method to process each page
    def parse_item(self, response):
        """Build one item per outbound link found on ``response``.

        Args:
            response: the Scrapy response for the crawled page. Its ``meta``
                must contain a ``"depth"`` entry.

        Returns:
            A list of ``BuzzlinksItem`` objects, possibly empty.

        Raises:
            KeyError: if ``response.meta`` has no ``"depth"`` key.
        """
        # Page-level values are identical for every link on the page, so
        # compute them once up front.
        depth = response.meta["depth"]
        referring_url = response.request.headers.get('Referer', None)
        current_url = response.url
        title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()
        # NOTE(review): nothing visible in this file sets a 'pages_crawled'
        # stat, so this is None unless it is populated elsewhere - confirm.
        page_id = self.crawler.stats.get_value('pages_crawled')

        items = []
        for href in response.xpath(self._LINK_XPATH).extract():
            # Compare the extracted text directly; wrapping it in str() would
            # raise on non-ASCII hrefs under Python 2.
            if href == "javascript:;":
                continue

            parsed_uri = urlparse(href)

            item = BuzzlinksItem()
            item["depth"] = depth
            item["current_url"] = current_url
            item["referring_url"] = referring_url
            item["link"] = href
            item["article_title"] = title
            item["link_domain"] = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            item["page_id"] = page_id
            # The item must be collected, otherwise the callback returns an
            # empty list and nothing ever reaches the pipeline.
            items.append(item)
        return items
import scrapy
from scrapy.item import Item, Field
# Item
class BuzzlinksItem(Item):
    """Record describing a single link scraped from a crawled page."""

    # The page the link was found on.
    current_url = Field()
    referring_url = Field()
    depth = Field()
    page_id = Field()
    article_title = Field()

    # The link itself.
    link = Field()
    link_domain = Field()
from firebase import firebase
import json
# Pipeline
class BuzzlinksPipeline(object):
    """Item pipeline that posts every scraped link record to Firebase."""

    def process_item(self, item, spider):
        """Post a copy of ``item`` to the ``/buzzfeedLinks`` Firebase path.

        Args:
            item: mapping with the keys ``article_title``, ``link``,
                ``current_url``, ``referring_url``, ``link_domain`` and
                ``depth``.
            spider: the spider that produced ``item`` (unused).

        Returns:
            ``item`` unchanged, so that later pipelines still receive it.

        Raises:
            KeyError: if ``item`` lacks one of the keys listed above.
        """
        payload = {}
        # These fields are stored in their str() form.
        # NOTE(review): under Python 2, str() raises UnicodeEncodeError for
        # non-ASCII unicode values - confirm inputs or encode explicitly.
        for key in ('article_title', 'link', 'current_url',
                    'referring_url', 'link_domain'):
            payload[key] = str(item[key])
        # depth is stored as-is rather than stringified.
        payload['depth'] = item['depth']

        # NOTE(review): the Firebase URL is blank in this copy of the file;
        # it must be filled in before this can reach a real database.
        fb = firebase.FirebaseApplication('', None)
        # The original line was truncated to "result ='/buzzfeedLinks', d)";
        # restored as a post() call. The return value was never used.
        fb.post('/buzzfeedLinks', payload)
        return item
# Scrapy project settings.
BOT_NAME = 'buzzLinks'

# Where Scrapy looks for spiders, and where new ones are generated.
SPIDER_MODULES = ['buzzLinks.spiders']
NEWSPIDER_MODULE = 'buzzLinks.spiders'

# Pipelines applied to every scraped item, keyed by dotted path -> order.
ITEM_PIPELINES = {'buzzLinks.pipelines.BuzzlinksPipeline': 1}

# NOTE(review): the three entries below appeared without an enclosing
# assignment in this copy of the file. They are extension paths, so they are
# presumably the body of EXTENSIONS - confirm against the original settings.
EXTENSIONS = {
    'scrapy.contrib.corestats.CoreStats': 500,
    'scrapy.webservice.WebService': 500,
    'scrapy.telnet.TelnetConsole': 500,
}
