Scraping For Scooters From Hamrobazaar

Hamrobazaar.com is one of the most popular forum-based e-commerce websites in Nepal. One can find all kinds of items on the site. But finding the right item that is on sale (a scooter, for example) can be quite a challenging task because of the number of entries and variations.

So, here is a Scrapy script to the rescue!

 


import scrapy
import re
from hb_scrape.items import HbScrapeItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
import json
import requests

class scooty(scrapy.Spider):
    """Spider that searches hamrobazaar.com for scooter ads and scrapes each ad.

    The crawl runs in three steps:

    1. ``start_requests`` issues one search request per keyword variant.
    2. ``parse`` walks a search-result page, follows every ad link and then
       the "Next" pagination link (when there is one).
    3. ``parseAdLink`` reads the detail table of a single ad page into an
       ``HbScrapeItem``.
    """

    name = "scooty"

    def start_requests(self):
        """Yield one search ``Request`` per keyword spelling.

        Several spellings are searched because the keyword list below includes
        common misspellings ("scoter", "scotty") alongside the correct ones.
        """
        filters = ["scooty", "scooter", "scoter", "scotty"]
        linkUrl = 'http://hamrobazaar.com/search.php?do_search=Search&searchword={0}&Search.x=0&Search.y=0&catid_search=0'

        for keyword in filters:
            yield Request(linkUrl.format(keyword), self.parse)

    def parse(self, response):
        """Follow every ad link on a search-result page, then the next page.

        Args:
            response: the search-result page returned for a request issued by
                ``start_requests`` or by a previous call of this method.

        Yields:
            ``Request`` objects for each ad page (handled by ``parseAdLink``)
            and, if a "Next" link is present, for the following result page.
        """
        print(response.url)
        print('~~~~~~~~~~~~begin----------------------------')

        # Result links sit in cells with one of two background colours
        # (presumably alternating row colours -- confirm against the site).
        for bgcolor in ("#ECF0F6", "#F2F4F9"):
            for anchor in response.xpath('//td[@bgcolor="{0}"]/a'.format(bgcolor)):
                href = anchor.xpath('.//@href').extract_first()
                # Skip anchors without an href (extract_first() gives None) and
                # links containing 'useritems', which are not followed as ads.
                if href and 'useritems' not in href:
                    yield Request(response.urljoin(href), self.parseAdLink)

        # The "Next" label is a <u> element; the href is read from its
        # grandparent node.
        nextLink = response.xpath('//u[contains(text(),"Next")]')
        nextAlink = nextLink.xpath('../../@href').extract_first()
        # Without a "Next" link there is nothing more to crawl; joining None
        # would only resolve back to the page we are already on.
        if nextAlink:
            yield Request(response.urljoin(nextAlink), self.parse)

    @staticmethod
    def _valueText(response, label, descendants=False):
        """Select the text of the value cell that belongs to ``label``.

        Finds every ``<td>`` whose text contains ``label`` and, from its
        parent row, selects the text nodes of the second ``<td>``.

        Args:
            response: the ad page response.
            label: label text to look for, e.g. ``"Price:"``.
            descendants: when true, also include text nested in child
                elements of the value cell (``//text()`` instead of
                ``/text()``).

        Returns:
            A selector list of matching text nodes (possibly empty).
        """
        textPath = '../td[2]//text()' if descendants else '../td[2]/text()'
        labelCell = response.xpath('//td[contains(text(),"{0}")]'.format(label))
        return labelCell.xpath(textPath)

    def parseAdLink(self, response):
        """Extract one ad's details from its page into an ``HbScrapeItem``.

        Fields whose label is not found on the page are stored as ``None``
        (or as an empty string for the joined ``adTitle`` and ``address``).

        Args:
            response: the ad detail page.

        Yields:
            A single populated ``HbScrapeItem``.
        """
        item = HbScrapeItem()
        value = self._valueText

        title = response.xpath('//span[@class="title"]//text()').extract()
        item['adTitle'] = ''.join(title)

        item['adPostDate'] = value(response, "Ad Post Date:").extract_first()
        item['adViewsCount'] = value(response, "Ad Views:").extract_first()
        item['seller'] = value(response, "Sold by:").extract_first()
        item['sellerPhone'] = value(response, "Mobile Phone:").extract_first()

        # The location cell may hold several text nodes; keep all of them.
        item['address'] = ' '.join(value(response, "Location:").extract())

        # The price is read with //text(), i.e. including nested elements.
        item['price'] = value(response, "Price:", descendants=True).extract_first()

        item['makeYear'] = value(response, "Make Year:").extract_first()
        item['lotNumber'] = value(response, "Lot No:").extract_first()
        item['features'] = value(response, "Features:").extract_first()

        item['adUrl'] = response.url

        yield item