import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import concurrent.futures
import time
import logging
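
# Crawl a list of domains (one start URL per line in a text file) and report pages
# whose URL or visible text contains any of the target keywords.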
# Set up logging to capture potential issues.
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(message)s')
def find_keywords_on_page(url, keywords, case_sensitive=False):
"""Checks if keywords are present on a single web page and returns found
keywords with URL.
Returns a dictionary of found keywords and their URLs. Returns None on
error."""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Connection': 'keep-alive',
}
    try:
        with requests.Session() as session:
            response = session.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            try:
                soup = BeautifulSoup(response.content, 'html.parser',
                                     from_encoding=response.encoding)
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {url}: {e}")
                return None
            # Skip pages that declare a non-English language on the <html> tag.
            html_tag = soup.find('html')
            if html_tag and not html_tag.get('lang', 'en').startswith('en'):
                return None
            text = soup.get_text(separator=' ', strip=True)
            if not case_sensitive:
                text = text.lower()
            found_keywords = {}
            for keyword in keywords:
                search_term = keyword if case_sensitive else keyword.lower()
                # Whole-word match, so "bid" does not match inside "forbidden".
                if re.search(r'\b' + re.escape(search_term) + r'\b', text):
                    found_keywords[keyword] = url
            return found_keywords
    except requests.exceptions.RequestException as e:
        logging.warning(f"Request failed for {url}: {e}")
        return None
    except Exception as e:
        logging.warning(f"Unexpected error while checking {url}: {e}")
        return None
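
# Illustrative usage (placeholder URL):
#   find_keywords_on_page("https://example.com", ["RFP", "Bid"])
# returns something like {"RFP": "https://example.com"} when a whole-word match is
# found, {} when nothing matches, and None if the page could not be fetched.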
def crawl_and_search(start_url, keywords, session, timeout=300, case_sensitive=False):
    """Crawl a website and search for keywords, returning early if any keyword
    appears in a URL.

    Returns a dict of found keywords and their URLs, or None on error/timeout.
    Uses the shared session passed in by the caller."""
    visited = set()
    queue = [start_url]
    found_matches = {}
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'),
        'Connection': 'keep-alive',
    }
    # URL path segments that mark pages we do not need to crawl.
    irrelevant_words = ["member", "provider"]
    start_time = time.time()
    current_domain = urlparse(start_url).netloc
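    # Breadth-first crawl: pop the next URL from the front of the queue; links that
    # stay under the start URL are appended to the back as they are discovered.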
while queue:
current_url = queue.pop(0)
if current_url in visited:
continue
visited.add(current_url)
# Check for timeout at the beginning of each iteration
if time.time() - start_time > timeout:
print(f"Crawling of {current_domain} stopped due to timeout (>
{timeout} seconds).")
return None
# Check if any keyword is in the URL itself
for keyword in keywords:
if keyword.lower() in current_url.lower() if not case_sensitive else
keyword in current_url:
#print(f"Match found in URL: {current_url}")
return {keyword: current_url}
# Skip URLs containing irrelevant words
parsed_url = urlparse(current_url)
        path_segments = [segment.lower() for segment in parsed_url.path.split('/')
                         if segment]
if any(word in path_segments for word in irrelevant_words):
#print(f"Skipping URL due to irrelevant words: {current_url}")
continue
        try:
            response = session.get(current_url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            try:
                soup = BeautifulSoup(response.content, 'html.parser',
                                     from_encoding=response.encoding)
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for "
                              f"{current_url}: {e}")
                continue
            # Skip pages that declare a non-English language on the <html> tag.
            html_tag = soup.find('html')
            if html_tag and not html_tag.get('lang', 'en').startswith('en'):
                continue
            links = [urljoin(current_url, link['href'])
                     for link in soup.find_all('a', href=True)]
            # Re-fetches the page via find_keywords_on_page to run the keyword scan.
            page_keywords = find_keywords_on_page(current_url, keywords,
                                                  case_sensitive=case_sensitive)
            if page_keywords:
                found_matches.update(page_keywords)
            for absolute_url in links:
                if absolute_url.startswith(start_url) and absolute_url not in visited:
                    queue.append(absolute_url)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in (403, 404):
                # Skip this URL and continue with the next one.
                continue
            logging.warning(f"Error fetching URL {current_url}: {e}")
            return None
        except requests.exceptions.RequestException as e:
            logging.warning(f"Error fetching URL {current_url}: {e}")
            return None
        except Exception as e:
            logging.warning(f"Error processing URL {current_url}: {e}")
return found_matches
def process_domain(url, keywords, session, timeout=300, case_sensitive=False):
    """Helper that crawls a single domain using the shared session and timeout."""
    start_time = time.time()
    result = crawl_and_search(url, keywords, session, timeout, case_sensitive)
    end_time = time.time()
    if result is None:  # Timeout or other error inside crawl_and_search.
        return None
    if end_time - start_time > timeout:
        print(f"Crawling of {url} exceeded the timeout (> {timeout} seconds); "
              f"result discarded.")
        return None
    return result
def search_multiple_domains_from_file(filename, keywords, timeout=300,
                                      case_sensitive=False):
    """Search multiple domains listed in a text file (one URL per line) for
    keywords, using a thread pool."""
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        # Note: a single Session is shared by all worker threads. requests.Session
        # is not guaranteed to be thread-safe; one session per worker is safer.
        with requests.Session() as session, \
             concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            future_to_url = {
                executor.submit(process_domain, url, keywords, session, timeout,
                                case_sensitive): url
                for url in urls
            }
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    if result:
                        print(f"\nFound matches in {url}:")
                        for keyword, match_url in result.items():
                            print(f"  Keyword: {keyword}, URL: {match_url}")
                except Exception as e:
                    print(f"Error processing domain {url}: {e}")
except FileNotFoundError:
print(f"Error: File '{filename}' not found.")
if __name__ == "__main__":
    domains_file = "domains.txt"
    search_keywords = ["RFP", "Proposal", "Procurement", "Bid"]
    case_sensitive = False
    timeout_value = 300
    start_time = time.time()
    search_multiple_domains_from_file(domains_file, search_keywords,
                                      timeout=timeout_value,
                                      case_sensitive=case_sensitive)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")