Ballerono Cappuchino

This document presents a Python web scraping framework built on requests and BeautifulSoup, with SQLite for storage. It defines classes for scraping, content extraction, text analysis, and data persistence, along with rate limiting and proxy rotation, and includes data analysis and visualization utilities that summarize the scraped content.
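
Before the full listing, a minimal usage sketch assembled only from the classes defined below (the rate limit and URL are illustrative choices, not part of the original demo):

    scraper = WebScraper(rate_limit=(5, 60), use_proxy=False)  # at most 5 requests per 60 seconds
    page = scraper.scrape_url('https://httpbin.org/html', extract_links=True)
    if page:
        print(page.title, page.metadata.get('word_count', 0))
    report = DataAnalyzer(scraper.db_manager).generate_report()  # summary of everything stored so far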

import asyncio

import aiohttp
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json
import csv
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse, parse_qs
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Set, Callable, Any, Tuple
import hashlib
import sqlite3
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading
import queue
import logging
from fake_useragent import UserAgent
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import nltk
from textblob import TextBlob
import statistics

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class ScrapedData:
    url: str
    title: str
    content: str
    metadata: Dict[str, Any]
    timestamp: datetime
    hash_id: str

    def __post_init__(self):
        if not self.hash_id:
            self.hash_id = hashlib.md5(f"{self.url}{self.content}".encode()).hexdigest()

class RateLimiter:
    def __init__(self, max_requests: int, time_window: int):
        self.max_requests = max_requests
        self.time_window = time_window
        self.requests = []
        # Reentrant lock so the recursive retry in acquire() cannot deadlock
        self.lock = threading.RLock()

    def acquire(self):
        with self.lock:
            now = time.time()
            # Remove old requests outside the time window
            self.requests = [req_time for req_time in self.requests if now - req_time < self.time_window]

            if len(self.requests) >= self.max_requests:
                sleep_time = self.time_window - (now - self.requests[0])
                if sleep_time > 0:
                    time.sleep(sleep_time)
                return self.acquire()

            self.requests.append(now)
            return True

class ProxyRotator:
    def __init__(self, proxy_list: List[str]):
        self.proxies = proxy_list
        self.current_index = 0
        self.failed_proxies = set()
        self.lock = threading.Lock()

    def get_proxy(self) -> Optional[str]:
        with self.lock:
            if len(self.failed_proxies) >= len(self.proxies):
                return None

            for _ in range(len(self.proxies)):
                proxy = self.proxies[self.current_index]
                self.current_index = (self.current_index + 1) % len(self.proxies)

                if proxy not in self.failed_proxies:
                    return proxy

            return None

    def mark_failed(self, proxy: str):
        with self.lock:
            self.failed_proxies.add(proxy)

class ContentExtractor:
    def __init__(self):
        self.selectors = {
            'title': ['h1', 'title', '.title', '#title', '[class*="title"]'],
            'content': ['p', '.content', '.article', '.post', '[class*="content"]'],
            'links': ['a[href]'],
            'images': ['img[src]'],
            'meta_description': ['meta[name="description"]'],
            'meta_keywords': ['meta[name="keywords"]']
        }

    def extract_text(self, soup: BeautifulSoup, element_type: str) -> List[str]:
        elements = []
        for selector in self.selectors.get(element_type, []):
            found = soup.select(selector)
            elements.extend([elem.get_text(strip=True) for elem in found if elem.get_text(strip=True)])
        return elements

    def extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urljoin(base_url, href)
            links.append({
                'url': absolute_url,
                'text': link.get_text(strip=True),
                'title': link.get('title', '')
            })
        return links

    def extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
        metadata = {}

        # Meta tags
        for meta in soup.find_all('meta'):
            name = meta.get('name') or meta.get('property')
            content = meta.get('content')
            if name and content:
                metadata[name] = content

        # Page statistics
        metadata['word_count'] = len(soup.get_text().split())
        metadata['link_count'] = len(soup.find_all('a'))
        metadata['image_count'] = len(soup.find_all('img'))
        metadata['heading_count'] = len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))

        return metadata

class TextAnalyzer:
    def __init__(self):
        # Download required NLTK data
        try:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('vader_lexicon', quiet=True)
        except Exception:
            pass

    def analyze_text(self, text: str) -> Dict[str, Any]:
        analysis = {}

        # Basic statistics
        words = text.split()
        analysis['word_count'] = len(words)
        analysis['char_count'] = len(text)
        analysis['sentence_count'] = len(re.split(r'[.!?]+', text))
        analysis['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0

        # Readability metrics
        analysis['flesch_reading_ease'] = self.calculate_flesch_score(text)

        # Sentiment analysis
        try:
            blob = TextBlob(text)
            analysis['sentiment_polarity'] = blob.sentiment.polarity
            analysis['sentiment_subjectivity'] = blob.sentiment.subjectivity
        except Exception:
            analysis['sentiment_polarity'] = 0
            analysis['sentiment_subjectivity'] = 0

        # Keyword extraction
        analysis['top_words'] = self.extract_keywords(text, top_n=10)

        return analysis

    def calculate_flesch_score(self, text: str) -> float:
        sentences = len(re.split(r'[.!?]+', text))
        words = len(text.split())
        syllables = sum(self.count_syllables(word) for word in text.split())

        if sentences == 0 or words == 0:
            return 0

        score = 206.835 - (1.015 * (words / sentences)) - (84.6 * (syllables / words))
        return max(0, min(100, score))

    def count_syllables(self, word: str) -> int:
        word = word.lower()
        vowels = "aeiouy"
        syllable_count = 0
        previous_char_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_char_was_vowel:
                syllable_count += 1
            previous_char_was_vowel = is_vowel

        if word.endswith('e'):
            syllable_count -= 1

        return max(1, syllable_count)

    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, int]]:
        # Simple keyword extraction based on frequency
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Remove common stop words
        stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                      'of', 'with', 'by', 'this', 'that', 'these', 'those'}
        words = [word for word in words if word not in stop_words]

        word_freq = Counter(words)
        return word_freq.most_common(top_n)

class DatabaseManager:
    def __init__(self, db_path: str = "scraping_data.db"):
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS scraped_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE,
                title TEXT,
                content TEXT,
                metadata TEXT,
                timestamp TEXT,
                hash_id TEXT UNIQUE
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS analytics (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                data_id INTEGER,
                analysis TEXT,
                created_at TEXT,
                FOREIGN KEY (data_id) REFERENCES scraped_data (id)
            )
        ''')

        conn.commit()
        conn.close()

    def save_data(self, data: ScrapedData) -> bool:
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            cursor.execute('''
                INSERT OR REPLACE INTO scraped_data
                (url, title, content, metadata, timestamp, hash_id)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                data.url,
                data.title,
                data.content,
                json.dumps(data.metadata),
                data.timestamp.isoformat(),
                data.hash_id
            ))

            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"Database save error: {e}")
            return False

    def get_all_data(self) -> List[ScrapedData]:
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        cursor.execute('SELECT * FROM scraped_data')
        rows = cursor.fetchall()

        data_list = []
        for row in rows:
            data = ScrapedData(
                url=row[1],
                title=row[2],
                content=row[3],
                metadata=json.loads(row[4]),
                timestamp=datetime.fromisoformat(row[5]),
                hash_id=row[6]
            )
            data_list.append(data)

        conn.close()
        return data_list

class WebScraper:
    def __init__(self, rate_limit: Tuple[int, int] = (10, 60), use_proxy: bool = False):
        self.rate_limiter = RateLimiter(rate_limit[0], rate_limit[1])
        self.content_extractor = ContentExtractor()
        self.text_analyzer = TextAnalyzer()
        self.db_manager = DatabaseManager()
        self.session = requests.Session()
        self.user_agent = UserAgent()
        self.proxy_rotator = None

        if use_proxy:
            # Example proxy list - in practice, you'd load from a file or service
            proxy_list = ['http://proxy1:8080', 'http://proxy2:8080']
            self.proxy_rotator = ProxyRotator(proxy_list)

    def scrape_url(self, url: str, extract_links: bool = False) -> Optional[ScrapedData]:
        proxies = None  # defined before the try so the except block can reference it safely
        try:
            self.rate_limiter.acquire()

            headers = {'User-Agent': self.user_agent.random}

            if self.proxy_rotator:
                proxy = self.proxy_rotator.get_proxy()
                if proxy:
                    proxies = {'http': proxy, 'https': proxy}

            response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract content
            title_elements = self.content_extractor.extract_text(soup, 'title')
            title = title_elements[0] if title_elements else ''

            content_elements = self.content_extractor.extract_text(soup, 'content')
            content = ' '.join(content_elements)

            # Extract metadata
            metadata = self.content_extractor.extract_metadata(soup)
            metadata['status_code'] = response.status_code
            metadata['content_type'] = response.headers.get('content-type', '')
            if extract_links:
                metadata['links'] = self.content_extractor.extract_links(soup, url)

            # Analyze text
            text_analysis = self.text_analyzer.analyze_text(content)
            metadata.update(text_analysis)

            # Create scraped data object
            scraped_data = ScrapedData(
                url=url,
                title=title,
                content=content,
                metadata=metadata,
                timestamp=datetime.now(),
                hash_id=''
            )

            # Save to database
            self.db_manager.save_data(scraped_data)

            logger.info(f"Successfully scraped: {url}")
            return scraped_data

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            if self.proxy_rotator and proxies:
                self.proxy_rotator.mark_failed(proxies['http'])
            return None

    def scrape_multiple(self, urls: List[str], max_workers: int = 5) -> List[ScrapedData]:
        results = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {executor.submit(self.scrape_url, url): url for url in urls}

            for future in future_to_url:
                try:
                    result = future.result(timeout=30)
                    if result:
                        results.append(result)
                except Exception as e:
                    logger.error(f"Error in thread execution: {e}")

        return results

    def crawl_website(self, start_url: str, max_depth: int = 2, max_pages: int = 50) -> List[ScrapedData]:
        visited = set()
        to_visit = [(start_url, 0)]  # (url, depth)
        results = []

        while to_visit and len(results) < max_pages:
            current_url, depth = to_visit.pop(0)

            if current_url in visited or depth > max_depth:
                continue
            visited.add(current_url)
            scraped_data = self.scrape_url(current_url, extract_links=True)

            if scraped_data:
                results.append(scraped_data)

                # Add new links to crawl
                if depth < max_depth and 'links' in scraped_data.metadata:
                    domain = urlparse(start_url).netloc
                    for link in scraped_data.metadata['links']:
                        link_url = link['url']
                        link_domain = urlparse(link_url).netloc

                        # Only crawl links from the same domain
                        if link_domain == domain and link_url not in visited:
                            to_visit.append((link_url, depth + 1))

        return results

class DataAnalyzer:
    def __init__(self, db_manager: DatabaseManager):
        self.db_manager = db_manager

    def generate_report(self) -> Dict[str, Any]:
        data = self.db_manager.get_all_data()

        if not data:
            return {"error": "No data available"}

        report = {
            "total_pages": len(data),
            "date_range": {
                "start": min(d.timestamp for d in data).isoformat(),
                "end": max(d.timestamp for d in data).isoformat()
            },
            "content_stats": self._analyze_content(data),
            "domain_analysis": self._analyze_domains(data),
            "sentiment_analysis": self._analyze_sentiment(data),
            "keyword_analysis": self._analyze_keywords(data)
        }

        return report

    def _analyze_content(self, data: List[ScrapedData]) -> Dict[str, Any]:
        word_counts = [d.metadata.get('word_count', 0) for d in data]
        reading_scores = [d.metadata.get('flesch_reading_ease', 0) for d in data]

        return {
            "avg_word_count": statistics.mean(word_counts) if word_counts else 0,
            "median_word_count": statistics.median(word_counts) if word_counts else 0,
            "avg_reading_ease": statistics.mean(reading_scores) if reading_scores else 0,
            "total_content_length": sum(len(d.content) for d in data)
        }

    def _analyze_domains(self, data: List[ScrapedData]) -> Dict[str, int]:
        domains = [urlparse(d.url).netloc for d in data]
        return dict(Counter(domains).most_common(10))

    def _analyze_sentiment(self, data: List[ScrapedData]) -> Dict[str, float]:
        polarities = [d.metadata.get('sentiment_polarity', 0) for d in data]
        subjectivities = [d.metadata.get('sentiment_subjectivity', 0) for d in data]

        return {
            "avg_polarity": statistics.mean(polarities) if polarities else 0,
            "avg_subjectivity": statistics.mean(subjectivities) if subjectivities else 0
        }

    def _analyze_keywords(self, data: List[ScrapedData]) -> List[Tuple[str, int]]:
        all_keywords = []
        for d in data:
            keywords = d.metadata.get('top_words', [])
            all_keywords.extend([word for word, count in keywords])

        return Counter(all_keywords).most_common(20)

    def create_visualizations(self):
        data = self.db_manager.get_all_data()

        if not data:
            print("No data available for visualization")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Word count distribution
        word_counts = [d.metadata.get('word_count', 0) for d in data]
        axes[0, 0].hist(word_counts, bins=20, alpha=0.7)
        axes[0, 0].set_title('Word Count Distribution')
        axes[0, 0].set_xlabel('Word Count')
        axes[0, 0].set_ylabel('Frequency')

        # Sentiment analysis
        polarities = [d.metadata.get('sentiment_polarity', 0) for d in data]
        axes[0, 1].hist(polarities, bins=20, alpha=0.7, color='green')
        axes[0, 1].set_title('Sentiment Polarity Distribution')
        axes[0, 1].set_xlabel('Polarity (-1 to 1)')
        axes[0, 1].set_ylabel('Frequency')

        # Reading ease scores
        reading_scores = [d.metadata.get('flesch_reading_ease', 0) for d in data]
        axes[1, 0].hist(reading_scores, bins=20, alpha=0.7, color='orange')
        axes[1, 0].set_title('Reading Ease Scores')
        axes[1, 0].set_xlabel('Flesch Reading Ease')
        axes[1, 0].set_ylabel('Frequency')

        # Domain distribution
        domains = [urlparse(d.url).netloc for d in data]
        domain_counts = Counter(domains).most_common(10)
        if domain_counts:
            domain_names, counts = zip(*domain_counts)
            axes[1, 1].bar(range(len(domain_names)), counts)
            axes[1, 1].set_title('Top Domains')
            axes[1, 1].set_xlabel('Domain')
            axes[1, 1].set_ylabel('Page Count')
            axes[1, 1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

def main():
    """Demonstration of the web scraping framework"""
    print("Web Scraping & Analytics Framework Demo")
    print("=" * 50)

    # Initialize scraper
    scraper = WebScraper(rate_limit=(5, 60), use_proxy=False)

    # Example URLs to scrape
    urls = [
        'https://httpbin.org/html',
        'https://httpbin.org/json',
        'https://httpbin.org/xml'
    ]

    print(f"Scraping {len(urls)} URLs...")

    # Scrape multiple URLs
    results = scraper.scrape_multiple(urls, max_workers=3)

    print(f"Successfully scraped {len(results)} pages")

    # Generate analytics report
    analyzer = DataAnalyzer(scraper.db_manager)
    report = analyzer.generate_report()

    print("\nAnalytics Report:")
    print(json.dumps(report, indent=2, default=str))

    # Export data to CSV
    data = scraper.db_manager.get_all_data()
    if data:
        df = pd.DataFrame([{
            'url': d.url,
            'title': d.title,
            'word_count': d.metadata.get('word_count', 0),
            'sentiment': d.metadata.get('sentiment_polarity', 0),
            'timestamp': d.timestamp
        } for d in data])

        df.to_csv('scraped_data.csv', index=False)
        print(f"\nExported {len(df)} records to scraped_data.csv")

    # Create visualizations (would work if matplotlib is available)
    try:
        analyzer.create_visualizations()
    except Exception as e:
        print(f"Visualization error: {e}")

if __name__ == "__main__":
    main()
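
The main() demo exercises scrape_multiple and the reporting path but not crawl_website. A minimal sketch for same-domain crawling, assuming the WebScraper class above (the start URL and limits are illustrative):

    scraper = WebScraper(rate_limit=(5, 60))
    pages = scraper.crawl_website('https://example.com', max_depth=1, max_pages=10)  # breadth-first, same domain only
    print(f"Crawled {len(pages)} pages")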
