Selenium Based Scraping with .NET

A script that I wrote for scraping agent info from Life Happens for a project.

Scraping was done for different zip codes.

C# (.NET) and Selenium were used for this project.


/// <summary>
/// DTO holding one agent record scraped from the Life Happens agent locator,
/// tagged with the zip code the search was run for.
/// </summary>
public class LifeAgentModel
{
// Full display name as scraped; excluded from JSON output — the split
// firstname/middlename/lastname fields below are serialized instead.
[JsonIgnore]
public string name { get; set; }
// Zip code the search was performed with (not the agent's own address zip).
public string zipCode { get; set; }
// Extra descriptor text from the <span> inside the name heading; may be empty.
public string detail { get; set; }
public string company { get; set; }
public string phone { get; set; }
public string address { get; set; }
// Name parts derived by splitting `name` on spaces in the scraper.
public string firstname { get; set; }
public string lastname { get; set; }
public string middlename { get; set; }
}

 

/// <summary>
/// Extension helpers for <see cref="IWebDriver"/>.
/// </summary>
public static class WebDriverExtensions
{
    /// <summary>
    /// Finds an element, explicitly waiting up to
    /// <paramref name="timeoutInSeconds"/> seconds for it to appear.
    /// A non-positive timeout falls back to an immediate lookup.
    /// </summary>
    /// <param name="driver">Driver to search with.</param>
    /// <param name="by">Locator for the target element.</param>
    /// <param name="timeoutInSeconds">Maximum seconds to wait; 0 or less disables waiting.</param>
    /// <returns>The first matching element.</returns>
    public static IWebElement FindElement(this IWebDriver driver, By by, int timeoutInSeconds)
    {
        if (timeoutInSeconds <= 0)
        {
            return driver.FindElement(by);
        }

        var wait = new WebDriverWait(driver, TimeSpan.FromSeconds(timeoutInSeconds));
        return wait.Until(d => d.FindElement(by));
    }
}

public class LifeHappensScraper
{
/// <summary>
/// Scrapes agent listings from the Life Happens agent locator for one zip code.
/// </summary>
/// <param name="driver">Live ChromeDriver session to navigate with.</param>
/// <param name="zipCode">Zip code to search for.</param>
/// <returns>
/// The scraped agents. (Previously the method was void and the list it built
/// was silently discarded; returning it is source-compatible for existing callers.)
/// </returns>
static List<LifeAgentModel> ScrapeAndAddItems(ChromeDriver driver, string zipCode)
{
    var lsResult = new List<LifeAgentModel>();
    driver.Navigate().GoToUrl("https://lifehappens.org/agent-locator/");

    var zipInput = driver.FindElement(By.XPath("//form[@class='zip']/input[@name='zip']"));
    zipInput.Click();
    zipInput.SendKeys(zipCode);

    driver.FindElement(By.XPath("//form[@class='zip']/div/input[@id='a']")).Click();
    driver.FindElement(By.XPath("//form[@class='zip']/input[@name='SubmitZip']")).Click();

    // NOTE(review): ImplicitWait is used here as a crude page-load pause,
    // matching the original script's approach; an explicit wait on the
    // results container would be more reliable.
    driver.Manage().Timeouts().ImplicitWait = TimeSpan.FromSeconds(3);

    var listResult = driver.FindElement(By.XPath("//div[@id='result-1']"), 1);
    var initialCount = listResult.FindElements(By.XPath("//dt")).Count;

    // Results are paged in chunks of 10 via a client-side "load more" link.
    // Ceiling division gives the number of clicks needed for the full set.
    var totalLoadMore = (initialCount + 9) / 10;
    for (var i = 0; i < totalLoadMore; i++)
    {
        try
        {
            driver.FindElement(By.XPath("//a[@class='load-more']")).Click();
        }
        catch (NoSuchElementException)
        {
            break; // link disappears once everything is loaded
        }
        driver.Manage().Timeouts().ImplicitWait = TimeSpan.FromSeconds(2);
    }

    // Query the element collections AFTER pagination completes. The original
    // captured them before clicking "load more", so rows beyond the first
    // page were never scraped.
    var names = listResult.FindElements(By.XPath("//dt/div[@class='agent-info']/h3[@class='name']"));
    var details = listResult.FindElements(By.XPath("//dt/div[@class='agent-info']/h3[@class='name']/span"));
    var companies = listResult.FindElements(By.XPath("//dt/div[@class='agent-info']/span"));
    var addresses = listResult.FindElements(By.XPath("//dt/a"));
    var phones = listResult.FindElements(By.XPath("//dt/div[@class='tel']"));

    for (int i = 0; i < names.Count; i++)
    {
        // Guard every parallel collection, not just `details`: the page does
        // not guarantee one sub-element of each kind per agent row.
        var model = new LifeAgentModel()
        {
            name = names[i].Text,
            detail = i < details.Count ? details[i].Text : "",
            company = i < companies.Count ? companies[i].Text : "",
            address = i < addresses.Count && addresses[i].Text.Length >= 2 ? addresses[i].Text : "...",
            phone = i < phones.Count ? phones[i].Text : "",
            zipCode = zipCode
        };

        if (!string.IsNullOrEmpty(model.name))
        {
            // Split defensively: the original indexed parts[1] unconditionally
            // and threw IndexOutOfRangeException for single-word names.
            var parts = model.name.Split(' ');

            model.firstname = parts[0];
            if (parts.Length > 1)
            {
                model.lastname = parts[parts.Length - 1];
            }
            if (parts.Length == 3)
            {
                model.middlename = parts[1];
            }

            lsResult.Add(model);
        }
    }

    return lsResult;
}

Scraping For Scooters From Hamrobazaar

Hamrobazaar.com is one of the most popular forum-based e-commerce websites in Nepal. One can find various items on this site. But finding the right item on sale — for example, a scooter — can be quite a challenging task because of the number of entries and variations.

So, here is a Scrapy script for your rescue!

 


import scrapy
import re
from hb_scrape.items import HbScrapeItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
import json
import requests

class scooty(scrapy.Spider):
    """Spider that searches hamrobazaar.com for scooter listings.

    It issues one search per spelling variant, follows every result link
    (skipping seller-profile ``useritems`` pages), walks the "Next"
    pagination, and emits one ``HbScrapeItem`` per ad page.
    """

    name = "scooty"

    def start_requests(self):
        # Common misspellings widen coverage of user-posted ads.
        filters = ["scooty", "scooter", "scoter", "scotty"]
        link_url = ('http://hamrobazaar.com/search.php?do_search=Search'
                    '&searchword={0}&Search.x=0&Search.y=0&catid_search=0')

        for word in filters:
            yield Request(link_url.format(word), self.parse)

    def parse(self, response):
        """Follow ad links on a search-results page, then the Next page."""
        print(response.url)
        print('~~~~~~~~~~~~begin----------------------------')

        # Result rows alternate between two background colors.
        for bgcolor in ('#ECF0F6', '#F2F4F9'):
            for sel in response.xpath('//td[@bgcolor="%s"]/a' % bgcolor):
                a_link = sel.xpath('@href').extract_first()
                # Guard against anchors with no href (the original raised
                # TypeError on `None`); skip seller-profile pages.
                if a_link and 'useritems' not in a_link:
                    yield Request(response.urljoin(a_link), self.parseAdLink)

        # Only follow "Next" when it exists — on the last page the original
        # passed None to urljoin and crashed.
        next_href = response.xpath('//u[contains(text(),"Next")]/../../@href').extract_first()
        if next_href:
            yield Request(response.urljoin(next_href), self.parse)

    def parseAdLink(self, response):
        """Scrape one ad page into an HbScrapeItem."""

        def label_cell(label, path='../td[2]/text()'):
            # One-line helper: value cell next to the <td> containing `label`.
            return response.xpath('//td[contains(text(),"%s")]' % label).xpath(path)

        item = HbScrapeItem()
        item['adTitle'] = ''.join(response.xpath('//span[@class="title"]//text()').extract())

        item['adPostDate'] = label_cell('Ad Post Date:').extract_first()
        item['adViewsCount'] = label_cell('Ad Views:').extract_first()
        item['seller'] = label_cell('Sold by:').extract_first()
        item['sellerPhone'] = label_cell('Mobile Phone:').extract_first()
        # Location can span multiple text nodes; join them all.
        item['address'] = ' '.join(label_cell('Location:').extract())
        # Price markup nests extra elements, hence the deeper `//text()`.
        item['price'] = label_cell('Price:', '../td[2]//text()').extract_first()
        item['makeYear'] = label_cell('Make Year:').extract_first()
        item['lotNumber'] = label_cell('Lot No:').extract_first()
        item['features'] = label_cell('Features:').extract_first()

        item['adUrl'] = response.url

        yield item