Python v3 URL and Page

The document is a Python script that crawls websites to search for specific keywords, utilizing the requests and BeautifulSoup libraries for web scraping. It includes functions for checking keywords on individual pages, crawling multiple domains, and processing results with error handling and logging. The script is designed to handle timeouts and irrelevant URLs while maintaining a session for efficiency.
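A minimal usage sketch (the URLs below are placeholders; the script expects a domains.txt file with one URL per line, which is how search_multiple_domains_from_file reads it):

domains.txt:
https://example.com
https://example.org

Save the code below as a .py file next to domains.txt and run it with Python 3; the keyword list and timeout are set at the bottom of the script.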


import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import concurrent.futures
import time
import logging

# Set up logging to capture potential issues.
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')

def find_keywords_on_page(url, keywords, case_sensitive=False):
    """Checks if keywords are present on a single web page and returns found keywords with URL.
    Returns a dictionary of found keywords and their URLs. Returns None on error."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        'Connection': 'keep-alive',
    }
    try:
        session = requests.Session()
        response = session.get(url, timeout=10, headers=headers)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        try:
            soup = BeautifulSoup(response.content, 'html.parser', from_encoding=response.encoding)
        except Exception as e:
            logging.error(f"Error parsing HTML with BeautifulSoup for {url}: {e}")
            session.close()
            return None

        # Check for language metadata here as well
        if soup.find('html'):
            lang = soup.find('html').get('lang', 'en')
            if not lang.startswith('en'):
                session.close()
                return None
        text = soup.get_text(separator=' ', strip=True).lower() if not case_sensitive else soup.get_text(separator=' ', strip=True)
        found_keywords = {}
        for keyword in keywords:
            search_term = keyword.lower() if not case_sensitive else keyword
            if re.search(r'\b' + re.escape(search_term) + r'\b', text):
                found_keywords[keyword] = url
        session.close()
        return found_keywords
    except requests.exceptions.RequestException:
        return None
    except Exception:
        return None
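
# Illustrative standalone call of find_keywords_on_page (not part of the original
# script); the URL is a placeholder:
#   matches = find_keywords_on_page("https://example.com", ["RFP", "Bid"])
#   if matches:
#       print(matches)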

def crawl_and_search(start_url, keywords, session, timeout=300):
    """Crawls a website and searches for keywords, stopping if any keyword appears in the URL.
    Returns a dictionary of found keywords and their URLs. Returns None on error.
    Uses a shared session."""
    visited = set()
    queue = [start_url]
    found_matches = {}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        'Connection': 'keep-alive',
    }
    irrelevant_words = ["member", "provider"]
    start_time = time.time()
    current_domain = urlparse(start_url).netloc

    while queue:
        current_url = queue.pop(0)
        if current_url in visited:
            continue
        visited.add(current_url)

        # Check for timeout at the beginning of each iteration
        if time.time() - start_time > timeout:
            print(f"Crawling of {current_domain} stopped due to timeout (> {timeout} seconds).")
            return None

        # Check if any keyword is in the URL itself
        # (relies on the module-level case_sensitive flag set in __main__)
        for keyword in keywords:
            if keyword.lower() in current_url.lower() if not case_sensitive else keyword in current_url:
                #print(f"Match found in URL: {current_url}")
                return {keyword: current_url}

        # Skip URLs containing irrelevant words
        parsed_url = urlparse(current_url)
        path_segments = [segment.lower() for segment in parsed_url.path.split('/') if segment]
        if any(word in path_segments for word in irrelevant_words):
            #print(f"Skipping URL due to irrelevant words: {current_url}")
            continue

        try:
            response = session.get(current_url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding

            try:
                soup = BeautifulSoup(response.content, 'html.parser', from_encoding=response.encoding)
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {current_url}: {e}")
                continue

            # Check for language
            if soup.find('html'):
                lang = soup.find('html').get('lang', 'en')
                if not lang.startswith('en'):
                    #print(f"Skipping non-English page: {current_url}")
                    continue  # Skip non-English pages

            links = [urljoin(current_url, link['href']) for link in soup.find_all('a', href=True)]
            page_keywords = find_keywords_on_page(current_url, keywords)
            if page_keywords:
                found_matches.update(page_keywords)
            for absolute_url in links:
                if absolute_url.startswith(start_url) and absolute_url not in visited:
                    queue.append(absolute_url)

        except requests.exceptions.HTTPError as e:
            if e.response.status_code in [404, 403]:
                #print(f"Skipping URL due to {e.response.status_code} error: {current_url}")
                continue  # Skip this URL and continue with the next one
            else:
                #print(f"Error fetching URL {current_url}: {e}")
                return None
        except requests.exceptions.RequestException as e:
            #print(f"Error fetching URL {current_url}: {e}")
            return None
        except Exception as e:
            print(f"Error processing URL {current_url}: {e}")
            pass
    return found_matches
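
# Illustrative direct use of crawl_and_search (not part of the original script);
# example.com is a placeholder, and the function reads the module-level
# case_sensitive flag, so that flag must exist before calling it this way:
#   case_sensitive = False
#   shared_session = requests.Session()
#   matches = crawl_and_search("https://example.com", ["RFP"], shared_session, timeout=60)
#   shared_session.close()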

def process_domain(url, keywords, session, timeout=300):
    """Helper function to process a single domain.
    Now takes a session and a timeout."""
    start_time = time.time()
    result = crawl_and_search(url, keywords, session, timeout)
    end_time = time.time()
    if result is None:  # Check for None (timeout or other error)
        return None
    if end_time - start_time > timeout:
        print(f"Crawling of {url} stopped due to timeout (> {timeout} seconds).")
        return None
    return result

def search_multiple_domains_from_file(filename, keywords, timeout=300):
    """Searches multiple domains from a text file for keywords using a thread pool."""
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            session = requests.Session()
            future_to_url = {executor.submit(process_domain, url, keywords, session, timeout): url for url in urls}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    if result:
                        print(f"\nSearching: {url}")
                        print(f"Found matches in {url}:")
                        for keyword, match_url in result.items():
                            print(f" Keyword: {keyword}, URL: {match_url}")
                except Exception as e:
                    print(f"Error processing domain {url}: {e}")
                    pass
            session.close()

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")

if __name__ == "__main__":
    domains_file = "domains.txt"
    search_keywords = ["RFP", "Proposal", "Procurement", "Bid"]
    case_sensitive = False
    timeout_value = 300

    start_time = time.time()
    search_multiple_domains_from_file(domains_file, search_keywords, timeout=timeout_value)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")
