import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import concurrent.futures
import time
import logging
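
# Crawl a list of domains (one start URL per line in a text file) and report pages
# whose URL or visible text contains any of the target keywords.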
# Set up logging to capture potential issues.
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(message)s')
def find_keywords_on_page(url, keywords, case_sensitive=False):
"""Checks if keywords are present on a single web page and returns found
keywords with URL.
Returns a dictionary of found keywords and their URLs. Returns None on
error."""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Connection': 'keep-alive',
}
    try:
        with requests.Session() as session:
            response = session.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            try:
                soup = BeautifulSoup(response.content, 'html.parser',
                                     from_encoding=response.encoding)
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {url}: {e}")
                return None
            # Skip pages that declare a non-English language on the <html> tag.
            html_tag = soup.find('html')
            if html_tag and not html_tag.get('lang', 'en').startswith('en'):
                return None
            text = soup.get_text(separator=' ', strip=True)
            if not case_sensitive:
                text = text.lower()
            found_keywords = {}
            for keyword in keywords:
                search_term = keyword if case_sensitive else keyword.lower()
                # Whole-word match, so "bid" does not match inside "forbidden".
                if re.search(r'\b' + re.escape(search_term) + r'\b', text):
                    found_keywords[keyword] = url
            return found_keywords
    except requests.exceptions.RequestException as e:
        logging.warning(f"Request failed for {url}: {e}")
        return None
    except Exception as e:
        logging.warning(f"Unexpected error while checking {url}: {e}")
        return None
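
# Illustrative usage (placeholder URL):
#   find_keywords_on_page("https://example.com", ["RFP", "Bid"])
# returns something like {"RFP": "https://example.com"} when a whole-word match is
# found, {} when nothing matches, and None if the page could not be fetched.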
def crawl_and_search(start_url, keywords, session, timeout=300, case_sensitive=False):
    """Crawl a website and search for keywords, returning early if any keyword
    appears in a URL.

    Returns a dict of found keywords and their URLs, or None on error/timeout.
    Uses the shared session passed in by the caller."""
    visited = set()
    queue = [start_url]
    found_matches = {}
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'),
        'Connection': 'keep-alive',
    }
    # URL path segments that mark pages we do not need to crawl.
    irrelevant_words = ["member", "provider"]
    start_time = time.time()
    current_domain = urlparse(start_url).netloc
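    # Breadth-first crawl: pop the next URL from the front of the queue; links that
    # stay under the start URL are appended to the back as they are discovered.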
while queue:
current_url = queue.pop(0)
if current_url in visited:
continue
visited.add(current_url)
# Check for timeout at the beginning of each iteration
if time.time() - start_time > timeout:
print(f"Crawling of {current_domain} stopped due to timeout (>
{timeout} seconds).")
return None
# Check if any keyword is in the URL itself
for keyword in keywords:
if keyword.lower() in current_url.lower() if not case_sensitive else
keyword in current_url:
#print(f"Match found in URL: {current_url}")
return {keyword: current_url}
# Skip URLs containing irrelevant words
parsed_url = urlparse(current_url)
        path_segments = [segment.lower() for segment in parsed_url.path.split('/')
                         if segment]
if any(word in path_segments for word in irrelevant_words):
#print(f"Skipping URL due to irrelevant words: {current_url}")
continue
        try:
            response = session.get(current_url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            try:
                soup = BeautifulSoup(response.content, 'html.parser',
                                     from_encoding=response.encoding)
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for "
                              f"{current_url}: {e}")
                continue
            # Skip pages that declare a non-English language on the <html> tag.
            html_tag = soup.find('html')
            if html_tag and not html_tag.get('lang', 'en').startswith('en'):
                continue
            links = [urljoin(current_url, link['href'])
                     for link in soup.find_all('a', href=True)]
            # Re-fetches the page via find_keywords_on_page to run the keyword scan.
            page_keywords = find_keywords_on_page(current_url, keywords,
                                                  case_sensitive=case_sensitive)
            if page_keywords:
                found_matches.update(page_keywords)
            for absolute_url in links:
                if absolute_url.startswith(start_url) and absolute_url not in visited:
                    queue.append(absolute_url)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in (403, 404):
                # Skip this URL and continue with the next one.
                continue
            logging.warning(f"Error fetching URL {current_url}: {e}")
            return None
        except requests.exceptions.RequestException as e:
            logging.warning(f"Error fetching URL {current_url}: {e}")
            return None
        except Exception as e:
            logging.warning(f"Error processing URL {current_url}: {e}")
return found_matches
def process_domain(url, keywords, session, timeout=300, case_sensitive=False):
    """Helper that crawls a single domain using the shared session and timeout."""
    start_time = time.time()
    result = crawl_and_search(url, keywords, session, timeout, case_sensitive)
    end_time = time.time()
    if result is None:  # Timeout or other error inside crawl_and_search.
        return None
    if end_time - start_time > timeout:
        print(f"Crawling of {url} exceeded the timeout (> {timeout} seconds); "
              f"result discarded.")
        return None
    return result
def search_multiple_domains_from_file(filename, keywords, timeout=300,
                                      case_sensitive=False):
    """Search multiple domains listed in a text file (one URL per line) for
    keywords, using a thread pool."""
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        # Note: a single Session is shared by all worker threads. requests.Session
        # is not guaranteed to be thread-safe; one session per worker is safer.
        with requests.Session() as session, \
             concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            future_to_url = {
                executor.submit(process_domain, url, keywords, session, timeout,
                                case_sensitive): url
                for url in urls
            }
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    if result:
                        print(f"\nFound matches in {url}:")
                        for keyword, match_url in result.items():
                            print(f"  Keyword: {keyword}, URL: {match_url}")
                except Exception as e:
                    print(f"Error processing domain {url}: {e}")
except FileNotFoundError:
print(f"Error: File '{filename}' not found.")
if __name__ == "__main__":
    domains_file = "domains.txt"
    search_keywords = ["RFP", "Proposal", "Procurement", "Bid"]
    case_sensitive = False
    timeout_value = 300
    start_time = time.time()
    search_multiple_domains_from_file(domains_file, search_keywords,
                                      timeout=timeout_value,
                                      case_sensitive=case_sensitive)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")