The document outlines a Scrapy spider class named DojCivilSpider designed to scrape violation data related to CMS from the Good Jobs First website. It includes configurations for session management, headers, and methods for handling requests and parsing data from the website. The spider is set up to manage pagination and extract specific details from company profiles, storing both raw and cleaned data for export.
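A minimal environment sketch for running the spider (an assumption inferred from the imports and settings in the listing, not something stated in the original):

    # Assumed dependencies, inferred from the imports and custom_settings below:
    #   pip install scrapy scrapy-impersonate pandas xlsxwriter
    # The spider can then be started with `scrapy crawl CMS` from within the
    # Scrapy project that contains it, or by running this file directly
    # (see the __main__ block at the end of the listing).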


import hashlib

import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.cmdline import execute

# Spider class for scraping violation data from Good Jobs First (CMS-related entries)
class DojCivilSpider(scrapy.Spider):
    name = "CMS"

    # Preset cookies required for session management and access
    cookies = {
        'PHPSESSID': '606e290aa776d7daa4d8b4612f6c7d0e',
        '_gid': 'GA1.2.1900651794.1747137516',
        '_fbp': 'fb.1.1747137517082.26241025820320942',
        '_gat_UA-21812781-2': '1',
        '_ga_9VW1HCFL7C': 'GS2.1.s1747141508$o2$g0$t1747141508$j0$l0$h0',
        '_ga': 'GA1.1.1953708763.1747137516',
    }
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'priority': 'u=0, i',
        'referer': 'https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=OFAC',
        'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
        # 'cookie': 'PHPSESSID=606e290aa776d7daa4d8b4612f6c7d0e; _gid=GA1.2.1900651794.1747137516; _fbp=fb.1.1747137517082.26241025820320942; _gat_UA-21812781-2=1; _ga_9VW1HCFL7C=GS2.1.s1747141508$o2$g0$t1747141508$j0$l0$h0; _ga=GA1.1.1953708763.1747137516',
    }

    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_impersonate.ImpersonateDownloadHandler",
            "https": "scrapy_impersonate.ImpersonateDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }
    def __init__(self):
        # Store all parsed data in lists for final export
        self.data = []
        self.raw_data = []
        self.data_cleaned = []
        self.data_uncleaned = []
        self.excel = os.path.join(os.getcwd(), "exports")  # or specify your own path

        # Create the export directory if it doesn't exist
        os.makedirs(self.excel, exist_ok=True)
        self.url = "https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=OFAC"
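
    # Note: custom_settings above routes requests through scrapy-impersonate's
    # download handler; each request below picks one of a few browser profiles
    # (chrome110 / edge99 / safari15_5) at random via meta['impersonate'] so the
    # request is impersonated as that browser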
    def start_requests(self):
        browsers = [
            "chrome110",
            "edge99",
            "safari15_5"
        ]
        meta = {}
        meta['impersonate'] = random.choice(browsers)
        # Send initial request with custom headers and cookies
        yield scrapy.Request(url=self.url, meta=meta, headers=self.headers,
                             cookies=self.cookies, callback=self.parse_listing)

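    # Parse a listing page: queue each company's detail page, then walk the
    # paginated results using the ">>" (last page) link to find the page count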
    def parse_listing(self, response):
        company_links = response.xpath('//table[2]/tbody/tr/td[1]/a/@href').getall()

        for link in company_links:
            browsers = [
                "chrome110",
                "edge99",
                "safari15_5"
            ]
            meta = {}
            meta['impersonate'] = random.choice(browsers)
            yield response.follow(link, meta=meta, headers=self.headers,
                                  cookies=self.cookies, callback=self.parse_details)
            # break  # Uncomment during testing to limit the number of detail requests

        # Handle pagination by identifying the total number of pages and iterating through all of them
        next_page = response.xpath('//a[contains(text(),">>")]/@href').get()
        if next_page:
            total_page = int(next_page.split('page=')[-1])
            for page in range(0, total_page + 1):
                paginated_url = f"{self.url}&page={page}"
                browsers = [
                    "chrome110",
                    "edge99",
                    "safari15_5"
                ]
                meta = {}
                meta['impersonate'] = random.choice(browsers)
                yield scrapy.Request(url=paginated_url, meta=meta,
                                     headers=self.headers, cookies=self.cookies, callback=self.parse_listing)
    # def parse_details(self, response):
    #     # Extract all key-value details from the company profile page
    #     item = dict()
    #     raw_item = dict()  # For storing completely raw data
    #
    #     item["PDP URL"] = response.url
    #     raw_item["PDP URL"] = response.url
    #
    #     for path in response.xpath('//div[@id="contentResult"]//b'):
    #         data_skip = False
    #         key = path.xpath('./text()').get(default='NA').strip()
    #         raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
    #         value = raw_value  # Initialize cleaned value with raw value
    #
    #         # Store raw value first
    #         raw_item[key] = raw_value
    #         # Format date fields into YYYY-MM-DD format (only for cleaned data)
    #         if 'Date' in key:
    #             try:
    #                 date_obj = datetime.strptime(value, "%B %d, %Y")
    #                 value = date_obj.strftime("%Y-%m-%d")
    #             except Exception:
    #                 pass
    #
    #         # Clean up punctuation from 'Company' names (only for cleaned data)
    #         if key == 'Company':
    #             text_no_punct = value.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    #             value = re.sub(r'\s+', ' ', text_no_punct).strip()
    #
    #         # Handle special keys that include hyperlinks or additional metadata
    #         if 'Mega-Scandal' in key or 'Source of Data' in key or 'Current Parent Company' in key:
    #             if 'Mega-Scandal' in key:
    #                 item[key + " name"] = value
    #                 item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key + " name"] = raw_value
    #                 raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             elif 'Current Parent Company' in key:
    #                 clean_name = value.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    #                 item[key + " name"] = clean_name
    #                 item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key + " name"] = raw_value  # Keep raw value
    #                 raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             else:
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value  # URL is the same in both
    #             data_skip = True
    #
    #         # Handle link-based values (e.g., "click here")
    #         if not data_skip:
    #             if "(click here)" in value.lower():
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value  # URL is the same in both
    #             item[key] = value
    #
    #     # Additional handling for specific URL columns
    #     url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
    #     for url_col in url_columns:
    #         if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
    #             url_value = response.xpath(f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
    #             item[url_col] = url_value
    #             raw_item[url_col] = url_value
    #
    #     self.data.append(item)
    #     self.raw_data.append(raw_item)  # Store raw data separately
    # def parse_details(self, response):
    #     # Extract all key-value details from the company profile page
    #     item = dict()
    #     raw_item = dict()  # For storing completely raw data
    #
    #     item["PDP URL"] = response.url
    #     raw_item["PDP URL"] = response.url
    #
    #     for path in response.xpath('//div[@id="contentResult"]//b'):
    #         data_skip = False
    #         key = path.xpath('./text()').get(default='NA').strip()
    #         raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
    #         value = raw_value  # Initialize cleaned value with raw value
    #
    #         # Store raw value first
    #         raw_item[key] = raw_value
    #
    #         # Format date fields into YYYY-MM-DD format (only for cleaned data)
    #         if 'Date' in key:
    #             try:
    #                 date_obj = datetime.strptime(value, "%B %d, %Y")
    #                 value = date_obj.strftime("%Y-%m-%d")
    #             except Exception:
    #                 pass
    #
    #         # Special handling for Company field only
    #         if key == 'Company':
    #             # First extract country if present
    #             name, country = self.extract_name_alias(value)
    #
    #             # Extract alias (content in parentheses)
    #             alias = None
    #             if '(' in value and ')' in value:
    #                 alias_match = re.search(r'\((.*?)\)', value)
    #                 if alias_match:
    #                     alias = alias_match.group(1)
    #                     value = re.sub(r'\(.*?\)', '', value).strip()
    #
    #             # Clean up punctuation (keeping - ' & /)
    #             text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
    #             value = re.sub(r'\s+', ' ', text_no_punct).strip()
    #
    #             # Store the processed values
    #             item[key] = value
    #             if alias:
    #                 item['Company Alias'] = alias
    #             if country:
    #                 item['Country'] = country
    #             continue  # Skip further processing for Company field
    #
    #         # Handle Current Parent Company (similar to Company but without alias/country extraction)
    #         if key == 'Current Parent Company':
    #             # Clean up punctuation (keeping - ' & /)
    #             text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
    #             value = re.sub(r'\s+', ' ', text_no_punct).strip()
    #             item[key + " name"] = value
    #             item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             raw_item[key + " name"] = raw_value
    #             raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             continue
    #
    #         # Handle other special keys (Mega-Scandal, Source of Data) without any punctuation changes
    #         if 'Mega-Scandal' in key or 'Source of Data' in key:
    #             if 'Mega-Scandal' in key:
    #                 item[key + " name"] = value
    #                 item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key + " name"] = raw_value
    #                 raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
    #             else:
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value
    #             data_skip = True
    #
    #         # Handle link-based values (e.g., "click here")
    #         if not data_skip:
    #             if "(click here)" in value.lower():
    #                 value = path.xpath('./following-sibling::a/@href').get(default='')
    #                 raw_item[key] = value
    #             item[key] = value  # All other fields keep original punctuation
    #
    #     # Additional handling for specific URL columns
    #     url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
    #     for url_col in url_columns:
    #         if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
    #             url_value = response.xpath(f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
    #             item[url_col] = url_value
    #             raw_item[url_col] = url_value
    #
    #     self.data.append(item)
    #     self.raw_data.append(raw_item)
    #
    # def extract_name_alias(self, entry):
    #     """
    #     Extract country from a given company name string.
    #     Returns a tuple: (name, country)
    #     """
    #     try:
    #         if not isinstance(entry, str):
    #             return entry, None
    #
    #         entry = entry.strip()
    #         country = None
    #
    #         # Known countries
    #         country_list = ['India', 'USA', 'United States', 'UK', 'Canada', 'Australia',
    #                         'Singapore', 'Germany', 'France', 'Hong Kong']
    #
    #         # Extract and remove country from parentheses
    #         parens = re.findall(r'\(([^()]+)\)', entry)
    #         for val in parens:
    #             if val.strip() in country_list:
    #                 country = val.strip()
    #                 entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()
    #
    #         return entry, country
    #
    #     except Exception:
    #         return entry, None
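
    # Active implementation (the commented-out versions above are earlier drafts
    # of the same method, kept for reference)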
    def parse_details(self, response):
        # Known countries list
        self.country_list = ['India', 'USA', 'United States', 'UK', 'Canada', 'Australia',
                             'Singapore', 'Germany', 'France', 'Hong Kong']

        # Extract all key-value details from the company profile page
        item = dict()
        raw_item = dict()  # For storing completely raw data

        item["PDP URL"] = response.url
        raw_item["PDP URL"] = response.url

        for path in response.xpath('//div[@id="contentResult"]//b'):
            data_skip = False
            key = path.xpath('./text()').get(default='NA').strip()
            raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
            value = raw_value  # Initialize cleaned value with raw value

            # Store raw value first
            raw_item[key] = raw_value

            # Format date fields into YYYY-MM-DD format (only for cleaned data)
            if 'Date' in key:
                try:
                    date_obj = datetime.strptime(value, "%B %d, %Y")
                    value = date_obj.strftime("%Y-%m-%d")
                except Exception:
                    pass

            # Special handling for Company field
            if key == 'Company':
                name, country = self.extract_name_alias(value)

                # Extract alias only if not a known country
                alias = None
                parens = re.findall(r'\(([^()]+)\)', value)
                for val in parens:
                    if val.strip() not in self.country_list:
                        alias = val.strip()
                        break

                # Remove all parentheses content from value
                value = re.sub(r'\(.*?\)', '', value).strip()

                # Clean up punctuation (keeping - ' & /)
                text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
                value = re.sub(r'\s+', ' ', text_no_punct).strip()

                # Store the processed values
                item[key] = value
                if alias:
                    item['Company Alias'] = alias
                if country:
                    item['Country'] = country
                continue  # Skip further processing for Company field

            # Handle Current Parent Company (similar to Company but without alias/country extraction)
            if key == 'Current Parent Company':
                # Clean up punctuation (keeping - ' & /)
                text_no_punct = value.translate(str.maketrans('', '', string.punctuation.replace('-', '').replace("'", '').replace('&', '').replace('/', '')))
                value = re.sub(r'\s+', ' ', text_no_punct).strip()
                item[key + " name"] = value
                item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                raw_item[key + " name"] = raw_value
                raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                continue

            # Handle special keys (Mega-Scandal, Source of Data)
            if 'Mega-Scandal' in key or 'Source of Data' in key:
                if 'Mega-Scandal' in key:
                    item[key + " name"] = value
                    item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key + " name"] = raw_value
                    raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                else:
                    value = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key] = value
                data_skip = True

            # Handle link-based values (e.g., "click here")
            if not data_skip:
                if "(click here)" in value.lower():
                    value = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key] = value
                item[key] = value  # All other fields keep original punctuation

        # Additional handling for specific URL columns
        url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
        for url_col in url_columns:
            if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
                url_value = response.xpath(
                    f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
                item[url_col] = url_value
                raw_item[url_col] = url_value

        self.data.append(item)
        self.raw_data.append(raw_item)

    def extract_name_alias(self, entry):
        """
        Extract country from a given company name string.
        Returns a tuple: (name, country)
        """
        try:
            if not isinstance(entry, str):
                return entry, None
            entry = entry.strip()
            country = None

            # Extract and remove country from parentheses
            parens = re.findall(r'\(([^()]+)\)', entry)
            for val in parens:
                if val.strip() in self.country_list:
                    country = val.strip()
                    entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()

            return entry, country

        except Exception:
            return entry, None

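    # When the spider closes, export two Excel files: one with the raw values
    # and one with the cleaned values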
    def close(self, reason):
        # Create DataFrames
        df_cleaned = pd.DataFrame(self.data)
        df_uncleaned = pd.DataFrame(self.raw_data)  # Use completely raw data

        # Add common columns to the cleaned DataFrame
        for df in [df_cleaned]:  # Only iterate over df_cleaned
            df.insert(0, 'ID', range(1, 1 + len(df)))
            df.insert(1, 'Source URL',
                      'https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=OFAC')

        # Only add ID to uncleaned data (without Source URL)
        df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
        df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company url', 'Source of Data'])

        # Rename the parent-company column in the cleaned data
        df_cleaned = df_cleaned.rename(columns={
            'Current Parent Company name': 'Current Parent Company'
        })

        # Save uncleaned file (completely raw data)
        timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
        filename_uncleaned = f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
        filepath_uncleaned = os.path.join(self.excel, filename_uncleaned)

        with pd.ExcelWriter(filepath_uncleaned, engine='xlsxwriter',
                            engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
            df_uncleaned.fillna("").to_excel(writer, index=False)

        # Process cleaned data
        protected_columns = ['ID', 'Source URL', 'PDP URL', 'Penalty', 'Date',
                             'Current Parent Company url', 'Archived Source',
                             'Mega-Scandal url']

        # 1. Replace N/A with blank except in protected columns
        for col in df_cleaned.columns:
            if col not in protected_columns:
                df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')

        # 2. Remove punctuation except & - / '
        def clean_text(text):
            if not isinstance(text, str) or text.strip() == '':
                return text
            keep_chars = {"&", "-", "/", "'"}
            cleaned = []
            for char in text:
                if char in string.punctuation and char not in keep_chars:
                    cleaned.append(' ')
                else:
                    cleaned.append(char)
            return re.sub(r'\s+', ' ', ''.join(cleaned)).strip()

        # Apply cleaning to non-protected string columns
        for col in df_cleaned.columns:
            if col not in protected_columns and df_cleaned[col].dtype == object:
                df_cleaned[col] = df_cleaned[col].apply(clean_text)

        # Save cleaned file
        filename_cleaned = f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
        filepath_cleaned = os.path.join(self.excel, filename_cleaned)

        with pd.ExcelWriter(filepath_cleaned, engine='xlsxwriter',
                            engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
            df_cleaned.to_excel(writer, index=False)

if __name__ == '__main__':
    execute("scrapy crawl CMS".split())
