Web scraping with Scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request

from botg.items import botgitem

# Listing URL template; %d is replaced with the page number to fetch.
url = "http://store.tcgplayer.com/magic/born-of-the-gods?pagenumber=%d"


class myspider(BaseSpider):
    """Scrape the paginated card listing, one page per request.

    Starts at page 1 and keeps requesting the next page number until a
    page contains no card blocks, at which point the spider is closed.
    """

    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [url % 1]

    def __init__(self):
        # Number of the page currently being parsed; bumped after each page.
        self.page_number = 1

    def parse(self, response):
        """Yield one item per card block, then a request for the next page.

        Raises:
            CloseSpider: when the page contains no card blocks.
        """
        # Parenthesised so the call behaves the same on Python 2 and 3.
        print(self.page_number)
        print("--------------------break-------------------------")

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magiccard']")
        if not titles:
            # An empty page is treated as the end of the listing.
            raise CloseSpider('no more pages')

        for title in titles:
            item = botgitem()
            # extract() returns a list of strings; [0] takes the first one.
            item["cardname"] = title.xpath(".//li[@class='cardname']/a/text()").extract()[0]
            # NOTE(review): this matches an <li> that itself carries an href
            # attribute; it is the selector the question reports as not
            # returning the rarity.
            item["rarity"] = title.xpath(".//li[@href='/magic/born-of-the-gods']/text()").extract()

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingamount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        # Queue the next page; parse() runs again on its response.
        self.page_number += 1
        yield Request(url % self.page_number)
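I am using this code to scrape the page, but I am not able to scrape the "rarity" field. Any help would be appreciated. Everything else seems to work. Also, can someone tell me what the "[0]" after .extract() does in the line that sets the cardname item?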
For the rarity field, I suggest:
- take the text representation of the <ul> containing <li class="cardname">,
- extract what comes after "rarity: " using a regex.
Something like this:
for title in titles:
    item = botgitem()
    # string(...) takes the text of the <ul> that contains the cardname <li>;
    # .re() then returns the word captured after "rarity:".
    item["rarity"] = title.xpath(
        'string(.//ul[li[@class="cardname"]])'
    ).re(r'rarity:\s*(\w+)')
About your second question: .extract() returns a list of strings, and [0] selects the first element of that list.
Comments
Post a Comment