#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 28 11:35:04 2020
@author: chrislovejoy
"""
import urllib
import requests
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import os
def find_jobs_from(website, job_title, location, desired_characs,
                   filename='results.xls'):
    """Scrape job listings from a supported job board and save them to Excel.

    NOTE(review): the original ``def`` line was lost from this file; the
    signature is reconstructed from the free names used in the body
    (``website``, ``job_title``, ``location``, ``desired_characs``,
    ``filename``) — confirm against callers.

    Parameters
    ----------
    website : str
        Which board to scrape: 'Indeed' or 'CWjobs'.
    job_title : str
        Job-title search term.
    location : str
        Location search term.
    desired_characs : list of str
        Fields to extract ('titles', 'companies', 'links', 'date_listed').
    filename : str
        Excel file the results are written to.
    """
    if website == 'Indeed':
        job_soup = load_indeed_jobs_div(job_title, location)
        jobs_list, num_listings = extract_job_information_indeed(
            job_soup, desired_characs)
    elif website == 'CWjobs':
        # CWjobs is scraped through a real browser session (Selenium),
        # unlike Indeed which is fetched directly.
        location_of_driver = os.getcwd()
        driver = initiate_driver(location_of_driver, browser='chrome')
        job_soup = make_job_search(job_title, location, driver)
        jobs_list, num_listings = extract_job_information_cwjobs(
            job_soup, desired_characs)
    else:
        # Fail fast: the original fell through and raised a NameError on
        # jobs_list at the save step for any unrecognised website.
        raise ValueError('Unsupported website: {!r}'.format(website))

    save_jobs_to_excel(jobs_list, filename)
def extract_job_information_indeed(job_soup, desired_characs):
    """Extract the requested fields from an Indeed results-page soup.

    NOTE(review): the original ``def`` line and the ``job_elems`` binding
    were lost from this file; both are reconstructed — verify the CSS
    selector against Indeed's current markup.

    Parameters
    ----------
    job_soup : BeautifulSoup
        Parsed HTML of an Indeed search-results page.
    desired_characs : list of str
        Fields to extract ('titles', 'companies', 'links', 'date_listed').

    Returns
    -------
    tuple
        (dict mapping each requested field to a list of values,
         number of listings found).
    """
    # One element per job card on the results page.
    # TODO(review): selector reconstructed — confirm against live markup.
    job_elems = job_soup.find_all('div', class_='jobsearch-SerpJobCard')

    cols = []
    extracted_info = []

    if 'titles' in desired_characs:
        cols.append('titles')
        extracted_info.append(
            [extract_job_title_indeed(elem) for elem in job_elems])

    if 'companies' in desired_characs:
        cols.append('companies')
        extracted_info.append(
            [extract_company_indeed(elem) for elem in job_elems])

    if 'links' in desired_characs:
        cols.append('links')
        extracted_info.append(
            [extract_link_indeed(elem) for elem in job_elems])

    if 'date_listed' in desired_characs:
        cols.append('date_listed')
        extracted_info.append(
            [extract_date_indeed(elem) for elem in job_elems])

    jobs_list = dict(zip(cols, extracted_info))

    # Guard the empty case: the original indexed extracted_info[0]
    # unconditionally, raising IndexError when desired_characs was empty.
    num_listings = len(extracted_info[0]) if extracted_info else 0

    return jobs_list, num_listings
def extract_job_title_indeed(job_elem):
    """Return the stripped job-title text from an Indeed job card."""
    heading = job_elem.find('h2', class_='title')
    return heading.text.strip()
def extract_company_indeed(job_elem):
    """Return the stripped company-name text from an Indeed job card."""
    badge = job_elem.find('span', class_='company')
    return badge.text.strip()
def extract_link_indeed(job_elem):
    """Return the listing URL, prefixed with the Indeed UK host."""
    href = job_elem.find('a')['href']
    return 'www.Indeed.co.uk/' + href
def extract_date_indeed(job_elem):
    """Return the stripped posting-date text from an Indeed job card."""
    stamp = job_elem.find('span', class_='date')
    return stamp.text.strip()
# NOTE(review): orphaned fragment — the enclosing function's ``def`` line and
# the earlier body (where ``driver`` and ``job_soup`` are bound) were lost
# from this file. Presumably the tail of the Selenium-based CWjobs page
# loader: wait briefly for client-side rendering, grab the page source, and
# return the parsed soup. Reconstruct the missing header before running.
driver.implicitly_wait(5)
page_source = driver.page_source
return job_soup
def extract_job_information_cwjobs(job_soup, desired_characs):
    """Extract the requested fields from a CWjobs results-page soup.

    NOTE(review): the original ``def`` line and the ``job_elems`` binding
    were lost from this file; both are reconstructed — verify the CSS
    selector against CWjobs' current markup.

    Parameters
    ----------
    job_soup : BeautifulSoup
        Parsed HTML of a CWjobs search-results page.
    desired_characs : list of str
        Fields to extract ('titles', 'companies', 'links', 'date_listed').

    Returns
    -------
    tuple
        (dict mapping each requested field to a list of values,
         number of listings found).
    """
    # One element per job card on the results page.
    # TODO(review): selector reconstructed — confirm against live markup.
    job_elems = job_soup.find_all('div', class_='job')

    cols = []
    extracted_info = []

    if 'titles' in desired_characs:
        cols.append('titles')
        extracted_info.append(
            [extract_job_title_cwjobs(elem) for elem in job_elems])

    if 'companies' in desired_characs:
        cols.append('companies')
        extracted_info.append(
            [extract_company_cwjobs(elem) for elem in job_elems])

    if 'links' in desired_characs:
        cols.append('links')
        extracted_info.append(
            [extract_link_cwjobs(elem) for elem in job_elems])

    if 'date_listed' in desired_characs:
        cols.append('date_listed')
        extracted_info.append(
            [extract_date_cwjobs(elem) for elem in job_elems])

    jobs_list = dict(zip(cols, extracted_info))

    # Guard the empty case: the original indexed extracted_info[0]
    # unconditionally, raising IndexError when desired_characs was empty.
    num_listings = len(extracted_info[0]) if extracted_info else 0

    return jobs_list, num_listings
def extract_job_title_cwjobs(job_elem):
    """Return the stripped job-title text from a CWjobs card."""
    heading = job_elem.find('h2')
    return heading.text.strip()
def extract_company_cwjobs(job_elem):
    """Return the stripped company-name text from a CWjobs card."""
    subheading = job_elem.find('h3')
    return subheading.text.strip()
def extract_link_cwjobs(job_elem):
    """Return the href of the first anchor in a CWjobs card."""
    return job_elem.find('a')['href']
def extract_date_cwjobs(job_elem):
    """Return the stripped posting-date text from a CWjobs card.

    (The original named its locals ``link``/``link_elem``, but the value
    extracted is the posting date.)
    """
    posted = job_elem.find('li', class_='date-posted')
    return posted.text.strip()