The document outlines a Scrapy spider class named DojCivilSpider designed to scrape violation data related to CMS from the Good Jobs First website. It includes configurations for session management, headers, and methods for handling requests and parsing data from the website. The spider is set up to manage pagination and extract specific details from company profiles, storing both raw and cleaned data for export.
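A minimal environment sketch for running the spider (an assumption inferred from the imports and settings in the listing, not something stated in the original):

    # Assumed dependencies, inferred from the imports and custom_settings below:
    #   pip install scrapy scrapy-impersonate pandas xlsxwriter
    # The spider can then be started with `scrapy crawl CMS` from within the
    # Scrapy project that contains it, or by running this file directly
    # (see the __main__ block at the end of the listing).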


import hashlib

import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.cmdline import execute

# Spider class for scraping violation data from Good Jobs First (CMS-related entries)
class DojCivilSpider(scrapy.Spider):
    name = "CMS"

    # Preset cookies required for session management and access
    cookies = {
        'PHPSESSID': '606e290aa776d7daa4d8b4612f6c7d0e',
        '_gid': 'GA1.2.1900651794.1747137516',
        '_fbp': 'fb.1.1747137517082.26241025820320942',
        '_gat_UA-21812781-2': '1',
        '_ga_9VW1HCFL7C': 'GS2.1.s1747141508$o2$g0$t1747141508$j0$l0$h0',
        '_ga': 'GA1.1.1953708763.1747137516',
    }
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'priority': 'u=0, i',
        'referer': 'https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=OFAC',
        'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
        # 'cookie': 'PHPSESSID=606e290aa776d7daa4d8b4612f6c7d0e; _gid=GA1.2.1900651794.1747137516; _fbp=fb.1.1747137517082.26241025820320942; _gat_UA-21812781-2=1; _ga_9VW1HCFL7C=GS2.1.s1747141508$o2$g0$t1747141508$j0$l0$h0; _ga=GA1.1.1953708763.1747137516',
    }

    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_impersonate.ImpersonateDownloadHandler",
            "https": "scrapy_impersonate.ImpersonateDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }
    def __init__(self):
        # Store all parsed data in lists for final export
        self.data = []
        self.raw_data = []
        self.data_cleaned = []
        self.data_uncleaned = []
        self.excel = os.path.join(os.getcwd(), "exports")  # or specify your own path

        # Create the export directory if it doesn't exist
        os.makedirs(self.excel, exist_ok=True)
        self.url = "https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=OFAC"
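
    # Note: custom_settings above routes requests through scrapy-impersonate's
    # download handler; each request below picks one of a few browser profiles
    # (chrome110 / edge99 / safari15_5) at random via meta['impersonate'] so the
    # request is impersonated as that browser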
    def start_requests(self):
        browsers = [
            "chrome110",
            "edge99",
            "safari15_5"
        ]
        meta = {}
        meta['impersonate'] = random.choice(browsers)
        # Send initial request with custom headers and cookies
        yield scrapy.Request(url=self.url, meta=meta, headers=self.headers,
                             cookies=self.cookies, callback=self.parse_listing)

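    # Parse a listing page: queue each company's detail page, then walk the
    # paginated results using the ">>" (last page) link to find the page count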
    def parse_listing(self, response):
        company_links = response.xpath('//table[2]/tbody/tr/td[1]/a/@href').getall()

        for link in company_links:
            browsers = [
                "chrome110",
                "edge99",
                "safari15_5"
            ]
            meta = {}
            meta['impersonate'] = random.choice(browsers)
            yield response.follow(link, meta=meta, headers=self.headers,
                                  cookies=self.cookies, callback=self.parse_details)
            # break  # Uncomment during testing to limit the number of detail requests

        # Handle pagination by identifying the total number of pages and iterating through all of them
        next_page = response.xpath('//a[contains(text(),">>")]/@href').get()
        if next_page:
            total_page = int(next_page.split('page=')[-1])
            for page in range(0, total_page + 1):
                paginated_url = f"{self.url}&page={page}"
                browsers = [
                    "chrome110",
                    "edge99",
                    "safari15_5"
                ]
                meta = {}
                meta['impersonate'] = random.choice(browsers)
                yield scrapy.Request(url=paginated_url, meta=meta,
                                     headers=self.headers, cookies=self.cookies, callback=self.parse_listing)
    # def parse_details(self, response):
    #     # Extract all key-value details from the company profile page
    #     item = dict()
    #     raw_item = dict()  # For storing completely raw data
    #
    #     item["PDP URL"] = response.url
    #     raw_item["PDP URL"] = response.url
    #
    #     for path in response.xpath('//div[@id="contentResult"]//b'):
    #         data_skip = False
    #         key = path.xpath('./text()').get(default='NA').strip()
    #         raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
    #         value = raw_value  # Initialize cleaned value with raw value
    #
    #         # Store raw value first
    #         raw_item[key] = raw_value
    #         # Format date fields into YYYY-MM-DD format (only for cleaned data)
    #         if 'Date' in key:
    #             try:
    #                 date_obj = datetime.strptime(value, "%B %d, %Y")
    #                 value = date_obj.strftime("%Y-%m-%d")
    #             except Exception:
    #                 pass
    #
    #         # Clean up punctuation from 'Company' names (only for cleaned data)
    #         if key == 'Company':
    #             text_no_punct = value.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    #             value = re.sub(r'\s+', ' ', text_no_punct).strip()
    #
    #         # Handle special keys that include hyperlinks or additional metadata
    #         if 'Mega-Scandal' in key or 'Source of Data' in key or 'Current Parent Company' in key:
    #             if 'Mega-Scandal' in key:
    #                 item[key + " name"] = value
    #                 item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key + " name"] = raw_value
    #                 raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             elif 'Current Parent Company' in key:
    #                 clean_name = value.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    #                 item[key + " name"] = clean_name
    #                 item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key + " name"] = raw_value  # Keep raw value
    #                 raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             else:
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value  # URL is the same in both
    #             data_skip = True
    #
    #         # Handle link-based values (e.g., "click here")
    #         if not data_skip:
    #             if "(click here)" in value.lower():
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value  # URL is the same in both
    #             item[key] = value
    #
    #     # Additional handling for specific URL columns
    #     url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
    #     for url_col in url_columns:
    #         if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
    #             url_value = response.xpath(f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
    #             item[url_col] = url_value
    #             raw_item[url_col] = url_value
    #
    #     self.data.append(item)
    #     self.raw_data.append(raw_item)  # Store raw data separately
    # def parse_details(self, response):
    #     # Extract all key-value details from the company profile page
    #     item = dict()
    #     raw_item = dict()  # For storing completely raw data
    #
    #     item["PDP URL"] = response.url
    #     raw_item["PDP URL"] = response.url
    #
    #     for path in response.xpath('//div[@id="contentResult"]//b'):
    #         data_skip = False
    #         key = path.xpath('./text()').get(default='NA').strip()
    #         raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
    #         value = raw_value  # Initialize cleaned value with raw value
    #
    #         # Store raw value first
    #         raw_item[key] = raw_value
    #
    #         # Format date fields into YYYY-MM-DD format (only for cleaned data)
    #         if 'Date' in key:
    #             try:
    #                 date_obj = datetime.strptime(value, "%B %d, %Y")
    #                 value = date_obj.strftime("%Y-%m-%d")
    #             except Exception:
    #                 pass
    #
    #         # Special handling for Company field only
    #         if key == 'Company':
    #             # First extract country if present
    #             name, country = self.extract_name_alias(value)
    #
    #             # Extract alias (content in parentheses)
    #             alias = None
    #             if '(' in value and ')' in value:
    #                 alias_match = re.search(r'\((.*?)\)', value)
    #                 if alias_match:
    #                     alias = alias_match.group(1)
    #                     value = re.sub(r'\(.*?\)', '', value).strip()
    #
    #             # Clean up punctuation (keeping - ' & /)
    #             text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
    #             value = re.sub(r'\s+', ' ', text_no_punct).strip()
    #
    #             # Store the processed values
    #             item[key] = value
    #             if alias:
    #                 item['Company Alias'] = alias
    #             if country:
    #                 item['Country'] = country
    #             continue  # Skip further processing for Company field
    #
    #         # Handle Current Parent Company (similar to Company but without alias/country extraction)
    #         if key == 'Current Parent Company':
    #             # Clean up punctuation (keeping - ' & /)
    #             text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
    #             value = re.sub(r'\s+', ' ', text_no_punct).strip()
    #             item[key + " name"] = value
    #             item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             raw_item[key + " name"] = raw_value
    #             raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             continue
    #
    #         # Handle other special keys (Mega-Scandal, Source of Data) without any punctuation changes
    #         if 'Mega-Scandal' in key or 'Source of Data' in key:
    #             if 'Mega-Scandal' in key:
    #                 item[key + " name"] = value
    #                 item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key + " name"] = raw_value
    #                 raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             else:
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value
    #             data_skip = True
    #
    #         # Handle link-based values (e.g., "click here")
    #         if not data_skip:
    #             if "(click here)" in value.lower():
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value
    #             item[key] = value  # All other fields keep original punctuation
    #
    #     # Additional handling for specific URL columns
    #     url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
    #     for url_col in url_columns:
    #         if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
    #             url_value = response.xpath(f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
    #             item[url_col] = url_value
    #             raw_item[url_col] = url_value
    #
    #     self.data.append(item)
    #     self.raw_data.append(raw_item)
    #
    # def extract_name_alias(self, entry):
    #     """
    #     Extract country from a given company name string.
    #     Returns a tuple: (name, country)
    #     """
    #     try:
    #         if not isinstance(entry, str):
    #             return entry, None
    #
    #         entry = entry.strip()
    #         country = None
    #
    #         # Known countries
    #         country_list = ['India', 'USA', 'United States', 'UK', 'Canada', 'Australia',
    #                         'Singapore', 'Germany', 'France', 'Hong Kong']
    #
    #         # Extract and remove country from parentheses
    #         parens = re.findall(r'\(([^()]+)\)', entry)
    #         for val in parens:
    #             if val.strip() in country_list:
    #                 country = val.strip()
    #                 entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()
    #
    #         return entry, country
    #
    #     except Exception:
    #         return entry, None
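
    # Active implementation (the commented-out versions above are earlier drafts
    # of the same method, kept for reference)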
    def parse_details(self, response):
        # Known countries list
        self.country_list = ['India', 'USA', 'United States', 'UK', 'Canada', 'Australia',
                             'Singapore', 'Germany', 'France', 'Hong Kong']

        # Extract all key-value details from the company profile page
        item = dict()
        raw_item = dict()  # For storing completely raw data

        item["PDP URL"] = response.url
        raw_item["PDP URL"] = response.url

        for path in response.xpath('//div[@id="contentResult"]//b'):
            data_skip = False
            key = path.xpath('./text()').get(default='NA').strip()
            raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
            value = raw_value  # Initialize cleaned value with raw value

            # Store raw value first
            raw_item[key] = raw_value

            # Format date fields into YYYY-MM-DD format (only for cleaned data)
            if 'Date' in key:
                try:
                    date_obj = datetime.strptime(value, "%B %d, %Y")
                    value = date_obj.strftime("%Y-%m-%d")
                except Exception:
                    pass

            # Special handling for Company field
            if key == 'Company':
                name, country = self.extract_name_alias(value)

                # Extract alias only if not a known country
                alias = None
                parens = re.findall(r'\(([^()]+)\)', value)
                for val in parens:
                    if val.strip() not in self.country_list:
                        alias = val.strip()
                        break

                # Remove all parentheses content from value
                value = re.sub(r'\(.*?\)', '', value).strip()

                # Clean up punctuation (keeping - ' & /)
                text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
                value = re.sub(r'\s+', ' ', text_no_punct).strip()

                # Store the processed values
                item[key] = value
                if alias:
                    item['Company Alias'] = alias
                if country:
                    item['Country'] = country
                continue  # Skip further processing for Company field

            # Handle Current Parent Company (similar to Company but without alias/country extraction)
            if key == 'Current Parent Company':
                # Clean up punctuation (keeping - ' & /)
                text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
                value = re.sub(r'\s+', ' ', text_no_punct).strip()
                item[key + " name"] = value
                item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                raw_item[key + " name"] = raw_value
                raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                continue

            # Handle special keys (Mega-Scandal, Source of Data)
            if 'Mega-Scandal' in key or 'Source of Data' in key:
                if 'Mega-Scandal' in key:
                    item[key + " name"] = value
                    item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key + " name"] = raw_value
                    raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                else:
                    value = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key] = value
                data_skip = True

            # Handle link-based values (e.g., "click here")
            if not data_skip:
                if "(click here)" in value.lower():
                    value = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key] = value
                item[key] = value  # All other fields keep original punctuation

        # Additional handling for specific URL columns
        url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
        for url_col in url_columns:
            if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
                url_value = response.xpath(
                    f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
                item[url_col] = url_value
                raw_item[url_col] = url_value

        self.data.append(item)
        self.raw_data.append(raw_item)

    def extract_name_alias(self, entry):
        """
        Extract country from a given company name string.
        Returns a tuple: (name, country)
        """
        try:
            if not isinstance(entry, str):
                return entry, None
            entry = entry.strip()
            country = None

            # Extract and remove country from parentheses
            parens = re.findall(r'\(([^()]+)\)', entry)
            for val in parens:
                if val.strip() in self.country_list:
                    country = val.strip()
                    entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()

            return entry, country

        except Exception:
            return entry, None

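    # When the spider closes, export two Excel files: one with the raw values
    # and one with the cleaned values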
    def close(self, reason):
        # Create DataFrames
        df_cleaned = pd.DataFrame(self.data)
        df_uncleaned = pd.DataFrame(self.raw_data)  # Use completely raw data

        # Add common columns to the cleaned DataFrame
        for df in [df_cleaned]:  # Only iterate over df_cleaned
            df.insert(0, 'ID', range(1, 1 + len(df)))
            df.insert(1, 'Source URL',
                      'https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=OFAC')

        # Only add ID to uncleaned data (without Source URL)
        df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
        df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company url', 'Source of Data'])

        # Rename the parent-company column in the cleaned data
        df_cleaned = df_cleaned.rename(columns={
            'Current Parent Company name': 'Current Parent Company'
        })

        # Save uncleaned file (completely raw data)
        timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
        filename_uncleaned = f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
        filepath_uncleaned = os.path.join(self.excel, filename_uncleaned)

        with pd.ExcelWriter(filepath_uncleaned, engine='xlsxwriter',
                            engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
            df_uncleaned.fillna("").to_excel(writer, index=False)

        # Process cleaned data
        protected_columns = ['ID', 'Source URL', 'PDP URL', 'Penalty', 'Date',
                             'Current Parent Company url', 'Archived Source',
                             'Mega-Scandal url']

        # 1. Replace N/A with blank except in protected columns
        for col in df_cleaned.columns:
            if col not in protected_columns:
                df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')

        # 2. Remove punctuation except & - / '
        def clean_text(text):
            if not isinstance(text, str) or text.strip() == '':
                return text
            keep_chars = {"&", "-", "/", "'"}
            cleaned = []
            for char in text:
                if char in string.punctuation and char not in keep_chars:
                    cleaned.append(' ')
                else:
                    cleaned.append(char)
            return re.sub(r'\s+', ' ', ''.join(cleaned)).strip()

        # Apply cleaning to non-protected string columns
        for col in df_cleaned.columns:
            if col not in protected_columns and df_cleaned[col].dtype == object:
                df_cleaned[col] = df_cleaned[col].apply(clean_text)

        # Save cleaned file
        filename_cleaned = f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
        filepath_cleaned = os.path.join(self.excel, filename_cleaned)

        with pd.ExcelWriter(filepath_cleaned, engine='xlsxwriter',
                            engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
            df_cleaned.to_excel(writer, index=False)

if __name__ == '__main__':
    execute("scrapy crawl CMS".split())
