import logging

import requests
import threading
from time import sleep
from random import choice
from bs4 import BeautifulSoup
from unidecode import unidecode
from urllib.parse import urlparse
from crosslinked.logger import Log
from datetime import datetime, timedelta
from urllib3 import disable_warnings, exceptions

disable_warnings(exceptions.InsecureRequestWarning)
logging.getLogger("urllib3").setLevel(logging.WARNING)
csv = logging.getLogger("cLinked_csv")

class Timer(threading.Thread):
    def __init__(self, timeout):
        threading.Thread.__init__(self)
        self.start_time = None
        self.running = None
        self.timeout = timeout

    def run(self):
        self.running = True
        self.start_time = datetime.now()
        logging.debug("Thread Timer: Started")

        while self.running:
            if (datetime.now() - self.start_time) > timedelta(seconds=self.timeout):
                self.stop()
            sleep(0.05)

    def stop(self):
        logging.debug("Thread Timer: Stopped")
        self.running = False
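
# Illustrative usage sketch (not part of the original module; safe to delete).
# A Timer acts as a soft deadline: run() flips `running` to False once
# `timeout` seconds have elapsed, so callers poll `running` in their loop.
def _demo_timer():
    t = Timer(2)  # hypothetical 2-second deadline
    t.start()
    while t.running:  # caveat: `running` stays None until run() is scheduled
        sleep(0.5)  # one unit of work per iteration
    t.stop()  # idempotent; run() also calls stop() on expiry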

class CrossLinked:
    def __init__(
        self, search_engine, target, timeout, conn_timeout=3, proxies=[], jitter=0
    ):
        self.results = []
        self.url = {
            "google": 'https://www.google.com/search?q=site:linkedin.com/in+"{}"&num=100&start={}',
            "bing": 'http://www.bing.com/search?q="{}"+site:linkedin.com/in&first={}',
        }

        self.runtime = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
        self.search_engine = search_engine
        self.conn_timeout = conn_timeout
        self.timeout = timeout
        self.proxies = proxies
        self.target = target
        self.jitter = jitter

    def search(self):
        search_timer = Timer(self.timeout)
        search_timer.start()

        while search_timer.running:
            try:
                url = self.url[self.search_engine].format(
                    self.target, len(self.results)
                )
                resp = web_request(url, self.conn_timeout, self.proxies)
                http_code = get_statuscode(resp)

                if http_code != 200:
                    Log.info("{:<3} {} ({})".format(len(self.results), url, http_code))
                    Log.warn("Non-200 response, exiting search ({})".format(http_code))
                    break

                self.page_parser(resp)
                Log.info("{:<3} {} ({})".format(len(self.results), url, http_code))

                sleep(self.jitter)
            except KeyboardInterrupt:
                Log.warn("Key event detected, exiting search...")
                break

        search_timer.stop()
        return self.results

    def page_parser(self, resp):
        # Hand each anchor tag on the results page to the results handler
        for link in extract_links(resp):
            try:
                self.results_handler(link)
            except Exception as e:
                Log.warn("Failed Parsing: {}- {}".format(link.get("href"), e))

    def link_parser(self, url, link):
        # Build a result record (url, raw text, title, name) from an anchor tag
        u = {"url": url}
        u["text"] = unidecode(
            link.text.split("|")[0].split("...")[0]
        )  # Capture link text before trailing chars
        u["title"] = self.parse_linkedin_title(u["text"])  # Extract job title
        u["name"] = self.parse_linkedin_name(u["text"])  # Extract whole name
        return u

    def parse_linkedin_title(self, data):
        # Title sits after the first "-" in the link text, before trailing chars
        try:
            title = data.split("-")[1].split("https:")[0]
            return title.split("...")[0].split("|")[0].strip()
        except Exception:
            return "N/A"

    def parse_linkedin_name(self, data):
        # Name is the portion of the link text before the first "-"
        try:
            name = data.split("-")[0].strip()
            return unidecode(name)
        except Exception:
            return False

    def results_handler(self, link):
        # Keep only links that point at linkedin.com/in profile pages
        url = str(link.get("href")).lower()

        if not extract_subdomain(url).endswith("linkedin.com"):
            return False
        elif "linkedin.com/in" not in url:
            return False

        data = self.link_parser(url, link)
        if data["name"]:
            self.log_results(data)

    def log_results(self, d):
        # Prevent duplicates & non-standard responses (i.e. "<span>linkedin.com</span></a>")
        if d in self.results:
            return
        elif "linkedin.com" in d["name"]:
            return

        self.results.append(d)
        # Search results are logged to names.csv, but names.txt is not generated
        # until the end to prevent duplicates
        logging.debug("name: {:25} RawTxt: {}".format(d["name"], d["text"]))
        csv.info(
            '"{}","{}","{}","{}","{}","{}",'.format(
                self.runtime,
                self.search_engine,
                d["name"],
                d["title"],
                d["url"],
                d["text"],
            )
        )
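
# Illustrative usage sketch (not part of the original module; safe to delete).
# The engine key must match CrossLinked.url ("google" or "bing"); every other
# value below is a hypothetical placeholder.
def _demo_crosslinked():
    scraper = CrossLinked(
        search_engine="google",  # or "bing"
        target="Example Corp",   # organization name to search for
        timeout=20,              # overall search deadline, in seconds
        conn_timeout=3,          # per-request connection timeout
        proxies=[],              # e.g. ["socks5://127.0.0.1:9050"]
        jitter=1,                # seconds to sleep between result pages
    )
    for person in scraper.search():
        print(person["name"], "|", person["title"], "|", person["url"])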

def get_statuscode(resp):
    try:
        return resp.status_code
    except Exception:  # web_request() returns False on failure
        return 0

def get_proxy(proxies):
    tmp = choice(proxies) if proxies else False
    return {"http": tmp, "https": tmp} if tmp else {}

def get_agent():
    # Pick a random User-Agent string to vary the request fingerprint
    return choice(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12.5; rv:104.0) Gecko/20100101 Firefox/104.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
        ]
    )
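
# Illustrative sanity check (not part of the original module; safe to delete).
# get_proxy() maps one randomly chosen proxy to both schemes, or returns an
# empty dict so requests falls back to a direct connection.
def _demo_proxy_helpers():
    assert get_proxy([]) == {}
    proxy = get_proxy(["socks5://127.0.0.1:9050"])  # hypothetical local proxy
    assert proxy["http"] == proxy["https"] == "socks5://127.0.0.1:9050"
    assert get_agent().startswith("Mozilla/5.0")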

def web_request(url, timeout=3, proxies=[], **kwargs):
    # Send a GET request with a random User-Agent and optional proxy; TLS
    # verification is disabled to tolerate intercepting proxies
    try:
        s = requests.Session()
        r = requests.Request(
            "GET",
            url,
            headers={"User-Agent": get_agent()},
            cookies={"CONSENT": "YES"},
            **kwargs
        )
        p = r.prepare()
        return s.send(p, timeout=timeout, verify=False, proxies=get_proxy(proxies))
    except requests.exceptions.TooManyRedirects as e:
        Log.fail("Proxy Error: {}".format(e))
    except Exception:
        pass
    return False
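
# Illustrative usage sketch (not part of the original module; safe to delete).
# web_request() returns a requests.Response on success and False on failure,
# which is why get_statuscode() tolerates non-Response values.
def _demo_web_request():
    resp = web_request("https://www.bing.com", timeout=3)  # no proxies
    if get_statuscode(resp) == 200:
        print("{} links found".format(len(extract_links(resp))))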

def extract_links(resp):
    # Collect every anchor tag from the response body
    links = []
    soup = BeautifulSoup(resp.content, "lxml")
    for link in soup.findAll("a"):
        links.append(link)
    return links

def extract_subdomain(url):
    return urlparse(url).netloc
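
# Note (added for clarity): extract_subdomain() returns the full network
# location, so regional profiles still pass the filter in results_handler():
#   extract_subdomain("https://uk.linkedin.com/in/jdoe") -> "uk.linkedin.com"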
