# NOTE(review): removed stray extraction-artifact lines ("84 3") that broke parsing.
import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.cmdline import execute
# Spider class for scraping violation data from Good Jobs First (CMS-related entries)
class DojCivilSpider(scrapy.Spider):
    """Scrape CMS-related violation entries from Good Jobs First.

    Uses scrapy-impersonate download handlers (browser TLS fingerprinting)
    over the asyncio Twisted reactor. Parsed rows accumulate on the instance
    and are exported to ``self.excel`` (an ``exports/`` directory under the
    current working directory).

    NOTE(review): the original file was corrupted — ``__init__`` contained an
    orphaned fragment (date formatting, ``self.data.append(item)``,
    ``return entry, None``) that clearly belongs to a missing parse/clean
    method. That logic is preserved below as ``_normalize_date``; the full
    parse callback must be restored from the original source.
    """

    name = "CMS"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_impersonate.ImpersonateDownloadHandler",
            "https": "scrapy_impersonate.ImpersonateDownloadHandler",
        },
        "TWISTED_REACTOR":
        "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }

    def __init__(self):
        # Let scrapy.Spider set up its own state (name, logger, etc.).
        super().__init__()
        # Accumulators for parsed rows, collected across responses.
        self.data = []            # cleaned/structured items
        self.raw_data = []        # raw, unprocessed items
        self.data_cleaned = []
        self.data_uncleaned = []
        # Output directory for the final export (or specify your own path).
        self.excel = os.path.join(os.getcwd(), "exports")

    @staticmethod
    def _normalize_date(key, value):
        """Return *value* reformatted as YYYY-MM-DD when *key* is a date field.

        Only keys containing the substring ``'Date'`` are touched; values that
        do not match the ``"%B %d, %Y"`` format (e.g. "March 5, 2021") are
        returned unchanged (best-effort, mirroring the original's
        silent-pass behavior).
        """
        if 'Date' in key:
            try:
                date_obj = datetime.strptime(value, "%B %d, %Y")
                value = date_obj.strftime("%Y-%m-%d")
            except Exception:
                # Unparseable date strings pass through untouched by design.
                pass
        return value
# Script entry point: launch this spider via scrapy's CLI runner.
if __name__ == '__main__':
    execute(["scrapy", "crawl", "CMS"])