import asyncio
import aiohttp
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json
import csv
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse, parse_qs
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Set, Callable, Any, Tuple
import hashlib
import sqlite3
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading
import queue
import logging
from fake_useragent import UserAgent
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import nltk
from textblob import TextBlob
import statistics
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
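# Record of a single scraped page; __post_init__ derives hash_id from the URL and
# content when it is not supplied, which supports deduplication in the database.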
@dataclass
class ScrapedData:
url: str
title: str
content: str
metadata: Dict[str, Any]
timestamp: datetime
hash_id: str
def __post_init__(self):
if not self.hash_id:
            self.hash_id = hashlib.md5(f"{self.url}{self.content}".encode()).hexdigest()
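# Thread-safe sliding-window rate limiter: permits at most max_requests calls per
# time_window seconds and sleeps callers until a slot becomes free.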
class RateLimiter:
def __init__(self, max_requests: int, time_window: int):
self.max_requests = max_requests
self.time_window = time_window
self.requests = []
self.lock = threading.Lock()
    def acquire(self):
        # Retry in a loop instead of recursing while holding the (non-reentrant) lock,
        # which would deadlock once the limit is reached.
        while True:
            with self.lock:
                now = time.time()
                # Remove old requests outside the time window
                self.requests = [req_time for req_time in self.requests if now - req_time < self.time_window]
                if len(self.requests) < self.max_requests:
                    self.requests.append(now)
                    return True
                sleep_time = self.time_window - (now - self.requests[0])
            # Sleep outside the lock so other threads are not blocked while we wait
            time.sleep(max(sleep_time, 0))
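# Round-robin rotation over a proxy list, skipping proxies previously marked as
# failed; get_proxy() returns None once every proxy has failed.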
class ProxyRotator:
def __init__(self, proxy_list: List[str]):
self.proxies = proxy_list
self.current_index = 0
self.failed_proxies = set()
self.lock = threading.Lock()
def get_proxy(self) -> Optional[str]:
with self.lock:
if len(self.failed_proxies) >= len(self.proxies):
return None
for _ in range(len(self.proxies)):
proxy = self.proxies[self.current_index]
self.current_index = (self.current_index + 1) % len(self.proxies)
if proxy not in self.failed_proxies:
return proxy
return None
def mark_failed(self, proxy: str):
with self.lock:
self.failed_proxies.add(proxy)
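# CSS-selector-based extraction of titles, body text, links, and metadata
# from a parsed BeautifulSoup document.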
class ContentExtractor:
def __init__(self):
self.selectors = {
'title': ['h1', 'title', '.title', '#title', '[class*="title"]'],
            'content': ['p', '.content', '.article', '.post', '[class*="content"]'],
'links': ['a[href]'],
'images': ['img[src]'],
'meta_description': ['meta[name="description"]'],
'meta_keywords': ['meta[name="keywords"]']
}
def extract_text(self, soup: BeautifulSoup, element_type: str) -> List[str]:
elements = []
for selector in self.selectors.get(element_type, []):
found = soup.select(selector)
            elements.extend([elem.get_text(strip=True) for elem in found if elem.get_text(strip=True)])
return elements
    def extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
links = []
for link in soup.find_all('a', href=True):
href = link['href']
absolute_url = urljoin(base_url, href)
links.append({
'url': absolute_url,
'text': link.get_text(strip=True),
'title': link.get('title', '')
})
return links
def extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
metadata = {}
# Meta tags
for meta in soup.find_all('meta'):
name = meta.get('name') or meta.get('property')
content = meta.get('content')
if name and content:
metadata[name] = content
# Page statistics
metadata['word_count'] = len(soup.get_text().split())
metadata['link_count'] = len(soup.find_all('a'))
metadata['image_count'] = len(soup.find_all('img'))
        metadata['heading_count'] = len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
return metadata
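# Lightweight text analytics: basic counts, a Flesch reading-ease estimate,
# TextBlob sentiment, and frequency-based keyword extraction.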
class TextAnalyzer:
def __init__(self):
# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)
        except Exception:
pass
def analyze_text(self, text: str) -> Dict[str, Any]:
analysis = {}
# Basic statistics
words = text.split()
analysis['word_count'] = len(words)
analysis['char_count'] = len(text)
        # Count only non-empty fragments so trailing punctuation does not inflate the total
        analysis['sentence_count'] = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
        analysis['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0
# Readability metrics
analysis['flesch_reading_ease'] = self.calculate_flesch_score(text)
# Sentiment analysis
try:
blob = TextBlob(text)
analysis['sentiment_polarity'] = blob.sentiment.polarity
analysis['sentiment_subjectivity'] = blob.sentiment.subjectivity
        except Exception:
analysis['sentiment_polarity'] = 0
analysis['sentiment_subjectivity'] = 0
# Keyword extraction
analysis['top_words'] = self.extract_keywords(text, top_n=10)
return analysis
    def calculate_flesch_score(self, text: str) -> float:
        # Flesch Reading Ease: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
        sentences = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
        words = len(text.split())
        syllables = sum(self.count_syllables(word) for word in text.split())
        if sentences == 0 or words == 0:
            return 0.0
        score = 206.835 - (1.015 * (words / sentences)) - (84.6 * (syllables / words))
        return max(0.0, min(100.0, score))
def count_syllables(self, word: str) -> int:
word = word.lower()
vowels = "aeiouy"
syllable_count = 0
previous_char_was_vowel = False
for char in word:
is_vowel = char in vowels
if is_vowel and not previous_char_was_vowel:
syllable_count += 1
previous_char_was_vowel = is_vowel
if word.endswith('e'):
syllable_count -= 1
return max(1, syllable_count)
    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, int]]:
# Simple keyword extraction based on frequency
words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
# Remove common stop words
        stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'this', 'that', 'these', 'those'}
words = [word for word in words if word not in stop_words]
word_freq = Counter(words)
return word_freq.most_common(top_n)
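# SQLite persistence layer: pages are upserted by URL/hash_id and can be read
# back as ScrapedData objects for later analysis.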
class DatabaseManager:
def __init__(self, db_path: str = "scraping_data.db"):
self.db_path = db_path
self.init_database()
def init_database(self):
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS scraped_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE,
title TEXT,
content TEXT,
metadata TEXT,
timestamp TEXT,
hash_id TEXT UNIQUE
)
''')
cursor.execute('''
CREATE TABLE IF NOT EXISTS analytics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
data_id INTEGER,
analysis TEXT,
created_at TEXT,
FOREIGN KEY (data_id) REFERENCES scraped_data (id)
)
''')
conn.commit()
conn.close()
def save_data(self, data: ScrapedData) -> bool:
try:
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('''
INSERT OR REPLACE INTO scraped_data
(url, title, content, metadata, timestamp, hash_id)
VALUES (?, ?, ?, ?, ?, ?)
''', (
data.url,
data.title,
data.content,
json.dumps(data.metadata),
data.timestamp.isoformat(),
data.hash_id
))
conn.commit()
conn.close()
return True
except Exception as e:
logger.error(f"Database save error: {e}")
return False
def get_all_data(self) -> List[ScrapedData]:
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('SELECT * FROM scraped_data')
rows = cursor.fetchall()
data_list = []
for row in rows:
data = ScrapedData(
url=row[1],
title=row[2],
content=row[3],
metadata=json.loads(row[4]),
timestamp=datetime.fromisoformat(row[5]),
hash_id=row[6]
)
data_list.append(data)
conn.close()
return data_list
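# High-level scraper that ties together rate limiting, optional proxy rotation,
# randomized user agents, content extraction, text analysis, and persistence.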
class WebScraper:
    def __init__(self, rate_limit: Tuple[int, int] = (10, 60), use_proxy: bool = False):
self.rate_limiter = RateLimiter(rate_limit[0], rate_limit[1])
self.content_extractor = ContentExtractor()
self.text_analyzer = TextAnalyzer()
self.db_manager = DatabaseManager()
self.session = requests.Session()
self.user_agent = UserAgent()
self.proxy_rotator = None
if use_proxy:
# Example proxy list - in practice, you'd load from a file or service
proxy_list = ['http://proxy1:8080', 'http://proxy2:8080']
self.proxy_rotator = ProxyRotator(proxy_list)
    def scrape_url(self, url: str, extract_links: bool = False) -> Optional[ScrapedData]:
        # Defined before the try block so the except handler can reference it safely
        proxies = None
        try:
            self.rate_limiter.acquire()
            headers = {'User-Agent': self.user_agent.random}
if self.proxy_rotator:
proxy = self.proxy_rotator.get_proxy()
if proxy:
proxies = {'http': proxy, 'https': proxy}
            response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Extract content
title_elements = self.content_extractor.extract_text(soup, 'title')
title = title_elements[0] if title_elements else ''
content_elements = self.content_extractor.extract_text(soup, 'content')
content = ' '.join(content_elements)
# Extract metadata
metadata = self.content_extractor.extract_metadata(soup)
metadata['status_code'] = response.status_code
metadata['content_type'] = response.headers.get('content-type', '')
if extract_links:
metadata['links'] = self.content_extractor.extract_links(soup, url)
# Analyze text
text_analysis = self.text_analyzer.analyze_text(content)
metadata.update(text_analysis)
# Create scraped data object
scraped_data = ScrapedData(
url=url,
title=title,
content=content,
metadata=metadata,
timestamp=datetime.now(),
hash_id=''
)
# Save to database
self.db_manager.save_data(scraped_data)
logger.info(f"Successfully scraped: {url}")
return scraped_data
except Exception as e:
logger.error(f"Error scraping {url}: {e}")
if self.proxy_rotator and proxies:
self.proxy_rotator.mark_failed(proxies['http'])
return None
    def scrape_multiple(self, urls: List[str], max_workers: int = 5) -> List[ScrapedData]:
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {executor.submit(self.scrape_url, url): url for url in urls}
for future in future_to_url:
try:
result = future.result(timeout=30)
if result:
results.append(result)
except Exception as e:
logger.error(f"Error in thread execution: {e}")
return results
    def crawl_website(self, start_url: str, max_depth: int = 2, max_pages: int = 50) -> List[ScrapedData]:
visited = set()
to_visit = [(start_url, 0)] # (url, depth)
results = []
while to_visit and len(results) < max_pages:
current_url, depth = to_visit.pop(0)
if current_url in visited or depth > max_depth:
continue
visited.add(current_url)
scraped_data = self.scrape_url(current_url, extract_links=True)
if scraped_data:
results.append(scraped_data)
# Add new links to crawl
if depth < max_depth and 'links' in scraped_data.metadata:
domain = urlparse(start_url).netloc
for link in scraped_data.metadata['links']:
link_url = link['url']
link_domain = urlparse(link_url).netloc
# Only crawl links from the same domain
if link_domain == domain and link_url not in visited:
to_visit.append((link_url, depth + 1))
return results
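# Builds summary reports (content, domain, sentiment, and keyword statistics)
# from stored pages and renders matplotlib histograms and bar charts.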
class DataAnalyzer:
def __init__(self, db_manager: DatabaseManager):
self.db_manager = db_manager
def generate_report(self) -> Dict[str, Any]:
data = self.db_manager.get_all_data()
if not data:
return {"error": "No data available"}
report = {
"total_pages": len(data),
"date_range": {
"start": min(d.timestamp for d in data).isoformat(),
"end": max(d.timestamp for d in data).isoformat()
},
"content_stats": self._analyze_content(data),
"domain_analysis": self._analyze_domains(data),
"sentiment_analysis": self._analyze_sentiment(data),
"keyword_analysis": self._analyze_keywords(data)
}
return report
def _analyze_content(self, data: List[ScrapedData]) -> Dict[str, Any]:
word_counts = [d.metadata.get('word_count', 0) for d in data]
reading_scores = [d.metadata.get('flesch_reading_ease', 0) for d in data]
        return {
            "avg_word_count": statistics.mean(word_counts) if word_counts else 0,
            "median_word_count": statistics.median(word_counts) if word_counts else 0,
            "avg_reading_ease": statistics.mean(reading_scores) if reading_scores else 0,
            "total_content_length": sum(len(d.content) for d in data)
        }
def _analyze_domains(self, data: List[ScrapedData]) -> Dict[str, int]:
domains = [urlparse(d.url).netloc for d in data]
return dict(Counter(domains).most_common(10))
def _analyze_sentiment(self, data: List[ScrapedData]) -> Dict[str, float]:
polarities = [d.metadata.get('sentiment_polarity', 0) for d in data]
        subjectivities = [d.metadata.get('sentiment_subjectivity', 0) for d in data]
        return {
            "avg_polarity": statistics.mean(polarities) if polarities else 0,
            "avg_subjectivity": statistics.mean(subjectivities) if subjectivities else 0
        }
def _analyze_keywords(self, data: List[ScrapedData]) -> List[Tuple[str, int]]:
all_keywords = []
for d in data:
keywords = d.metadata.get('top_words', [])
all_keywords.extend([word for word, count in keywords])
return Counter(all_keywords).most_common(20)
def create_visualizations(self):
data = self.db_manager.get_all_data()
if not data:
print("No data available for visualization")
return
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Word count distribution
word_counts = [d.metadata.get('word_count', 0) for d in data]
axes[0, 0].hist(word_counts, bins=20, alpha=0.7)
axes[0, 0].set_title('Word Count Distribution')
axes[0, 0].set_xlabel('Word Count')
axes[0, 0].set_ylabel('Frequency')
# Sentiment analysis
polarities = [d.metadata.get('sentiment_polarity', 0) for d in data]
axes[0, 1].hist(polarities, bins=20, alpha=0.7, color='green')
axes[0, 1].set_title('Sentiment Polarity Distribution')
axes[0, 1].set_xlabel('Polarity (-1 to 1)')
axes[0, 1].set_ylabel('Frequency')
# Reading ease scores
reading_scores = [d.metadata.get('flesch_reading_ease', 0) for d in data]
axes[1, 0].hist(reading_scores, bins=20, alpha=0.7, color='orange')
axes[1, 0].set_title('Reading Ease Scores')
axes[1, 0].set_xlabel('Flesch Reading Ease')
axes[1, 0].set_ylabel('Frequency')
# Domain distribution
domains = [urlparse(d.url).netloc for d in data]
domain_counts = Counter(domains).most_common(10)
if domain_counts:
domain_names, counts = zip(*domain_counts)
axes[1, 1].bar(range(len(domain_names)), counts)
axes[1, 1].set_title('Top Domains')
axes[1, 1].set_xlabel('Domain')
axes[1, 1].set_ylabel('Page Count')
axes[1, 1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
def main():
"""Demonstration of the web scraping framework"""
print("Web Scraping & Analytics Framework Demo")
print("=" * 50)
# Initialize scraper
scraper = WebScraper(rate_limit=(5, 60), use_proxy=False)
# Example URLs to scrape
urls = [
'https://httpbin.org/html',
'https://httpbin.org/json',
'https://httpbin.org/xml'
]
print(f"Scraping {len(urls)} URLs...")
# Scrape multiple URLs
results = scraper.scrape_multiple(urls, max_workers=3)
print(f"Successfully scraped {len(results)} pages")
# Generate analytics report
analyzer = DataAnalyzer(scraper.db_manager)
report = analyzer.generate_report()
print("\nAnalytics Report:")
print(json.dumps(report, indent=2, default=str))
# Export data to CSV
data = scraper.db_manager.get_all_data()
if data:
df = pd.DataFrame([{
'url': d.url,
'title': d.title,
'word_count': d.metadata.get('word_count', 0),
'sentiment': d.metadata.get('sentiment_polarity', 0),
'timestamp': d.timestamp
} for d in data])
df.to_csv('scraped_data.csv', index=False)
print(f"\nExported {len(df)} records to scraped_data.csv")
    # Create visualizations (may fail in headless environments without a display backend)
try:
analyzer.create_visualizations()
except Exception as e:
print(f"Visualization error: {e}")
if __name__ == "__main__":
main()