This is a snippet of Python Scrapy code that I recently wrote to crawl and scrape job listings from MeroJob, Nepal’s leading job listing site.

I am planning to use the data scraped from this site to analyze current job market trends in Nepal.

def parseJobLink(self, response):
    """Parse one job-detail response and yield a single ``MerojobItem``.

    The item is filled from three places on the page:

    * ``itemprop``-tagged nodes (employer name, employer description,
      job title) and the ``text-success`` span (view count);
    * the response itself (``jobLink``) and today's date
      (``jobViewsCountOn``);
    * the job-info table, whose first column holds labels and whose
      second column holds the matching values.

    Args:
        response: The Scrapy response for a job-detail page.

    Yields:
        The populated ``MerojobItem``. Fields whose label is not found in
        the table are simply left unset.
    """

    def clean(text):
        # Drop every non-ASCII character, then trim surrounding whitespace.
        return text.encode('ascii', 'ignore').decode('ascii').strip()

    def cleaned_cells(xpath):
        # Text nodes under `xpath`, cleaned, keeping only those longer than
        # one character (this discards whitespace-only and 1-char nodes).
        cleaned = (clean(node) for node in response.xpath(xpath).extract())
        return [cell for cell in cleaned if len(cell) > 1]

    item = MerojobItem()

    item['employer'] = response.xpath(
        '//span[@itemprop="name"]/text()').extract_first()

    # Join every text node of the description. Unlike calling a string
    # method on extract_first(), this yields '' instead of raising when the
    # page has no description node.
    description_parts = response.xpath(
        '//div[@itemprop="description"]//text()').extract()
    item['employerDesc'] = "".join(clean(part) for part in description_parts)

    item['jobTitle'] = response.xpath(
        '//h1[@itemprop="title"]/text()').extract_first()
    item['jobViews'] = response.xpath(
        '//span[@class="text-success"]/text()').extract_first()
    item['jobViewsCountOn'] = str(date.today())
    item['jobLink'] = response.url

    table_xpath = '//table[@class="table table-hover table-no-border m-0"][1]'
    job_info = response.xpath(table_xpath)

    # The leading '.' keeps the query relative to the job-info table; a bare
    # '//tr[3]' would search the whole document regardless of `job_info`.
    item['numberOfVacancy'] = job_info.xpath(
        './/tr[3]/td[2]/strong/text()').extract_first()

    values = cleaned_cells(table_xpath + '/tr/td[2]//text()')
    labels = cleaned_cells(table_xpath + '/tr/td[1]//text()')

    # These two labels are removed before pairing labels with values.
    # NOTE(review): this presumably relies on their values not appearing in
    # `values` (e.g. a one-digit vacancy count is dropped by the length
    # filter) -- verify against a live page, since a longer value would
    # shift the pairing below.
    skipped_labels = ('No. of Vacancy/s', 'Apply Before')
    labels = [label for label in labels if label not in skipped_labels]

    # (substring looked for in the label, item field to fill)
    field_by_keyword = (
        ('Category', 'jobCategory'),
        ('Job Level', 'jobLevel'),
        ('Employment Type', 'employementType'),
        ('Salary', 'salary'),
        ('Deadline', 'applicationDeadline'),
        ('Education Level', 'educationLevel'),
        ('Experience', 'experienceRequired'),
    )

    # Labels and values are paired by position. zip() stops at the shorter
    # list, so a label without a matching value is skipped instead of
    # raising IndexError.
    for label, value in zip(labels, values):
        for keyword, field in field_by_keyword:
            if keyword not in label:
                continue
            if field == 'applicationDeadline':
                # Only the first 20 characters of the deadline cell are kept.
                value_to_store = value[:20].strip()
            else:
                value_to_store = value
            item[field] = value_to_store

    yield item
Last modified: September 18, 2018

Author