import asyncio
import aiohttp
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json
import csv
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse, parse_qs
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Set, Callable, Any, Tuple
import hashlib
import sqlite3
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading
import queue
import logging
from fake_useragent import UserAgent
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import nltk
from textblob import TextBlob
import statistics
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
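# Record of a single scraped page; __post_init__ derives hash_id from the URL and
# content when it is not supplied, which supports deduplication in the database.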
@dataclass
class ScrapedData:
url: str
title: str
content: str
metadata: Dict[str, Any]
timestamp: datetime
hash_id: str
def __post_init__(self):
if not self.hash_id:
            self.hash_id = hashlib.md5(f"{self.url}{self.content}".encode()).hexdigest()
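# Thread-safe sliding-window rate limiter: permits at most max_requests calls per
# time_window seconds and sleeps callers until a slot becomes free.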
class RateLimiter:
def __init__(self, max_requests: int, time_window: int):
self.max_requests = max_requests
self.time_window = time_window
self.requests = []
self.lock = threading.Lock()
    def acquire(self):
        # Retry in a loop instead of recursing while holding the (non-reentrant) lock,
        # which would deadlock once the limit is reached.
        while True:
            with self.lock:
                now = time.time()
                # Remove old requests outside the time window
                self.requests = [req_time for req_time in self.requests if now - req_time < self.time_window]
                if len(self.requests) < self.max_requests:
                    self.requests.append(now)
                    return True
                sleep_time = self.time_window - (now - self.requests[0])
            # Sleep outside the lock so other threads are not blocked while we wait
            time.sleep(max(sleep_time, 0))
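# Round-robin rotation over a proxy list, skipping proxies previously marked as
# failed; get_proxy() returns None once every proxy has failed.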
class ProxyRotator:
def __init__(self, proxy_list: List[str]):
self.proxies = proxy_list
self.current_index = 0
self.failed_proxies = set()
self.lock = threading.Lock()
def get_proxy(self) -> Optional[str]:
with self.lock:
if len(self.failed_proxies) >= len(self.proxies):
return None
for _ in range(len(self.proxies)):
proxy = self.proxies[self.current_index]
self.current_index = (self.current_index + 1) % len(self.proxies)
if proxy not in self.failed_proxies:
return proxy
return None
def mark_failed(self, proxy: str):
with self.lock:
self.failed_proxies.add(proxy)
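# CSS-selector-based extraction of titles, body text, links, and metadata
# from a parsed BeautifulSoup document.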
class ContentExtractor:
def __init__(self):
self.selectors = {
'title': ['h1', 'title', '.title', '#title', '[class*="title"]'],
            'content': ['p', '.content', '.article', '.post', '[class*="content"]'],
'links': ['a[href]'],
'images': ['img[src]'],
'meta_description': ['meta[name="description"]'],
'meta_keywords': ['meta[name="keywords"]']
}
def extract_text(self, soup: BeautifulSoup, element_type: str) -> List[str]:
elements = []
for selector in self.selectors.get(element_type, []):
found = soup.select(selector)
            elements.extend([elem.get_text(strip=True) for elem in found if elem.get_text(strip=True)])
return elements
    def extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
links = []
for link in soup.find_all('a', href=True):
href = link['href']
absolute_url = urljoin(base_url, href)
links.append({
'url': absolute_url,
'text': link.get_text(strip=True),
'title': link.get('title', '')
})
return links
def extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
metadata = {}
# Meta tags
for meta in soup.find_all('meta'):
name = meta.get('name') or meta.get('property')
content = meta.get('content')
if name and content:
metadata[name] = content
# Page statistics
metadata['word_count'] = len(soup.get_text().split())
metadata['link_count'] = len(soup.find_all('a'))
metadata['image_count'] = len(soup.find_all('img'))
        metadata['heading_count'] = len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
return metadata
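# Lightweight text analytics: basic counts, a Flesch reading-ease estimate,
# TextBlob sentiment, and frequency-based keyword extraction.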
class TextAnalyzer:
def __init__(self):
# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)
        except Exception:
pass
def analyze_text(self, text: str) -> Dict[str, Any]:
analysis = {}
# Basic statistics
words = text.split()
analysis['word_count'] = len(words)
analysis['char_count'] = len(text)
        # Count only non-empty fragments so trailing punctuation does not inflate the total
        analysis['sentence_count'] = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
        analysis['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0
# Readability metrics
analysis['flesch_reading_ease'] = self.calculate_flesch_score(text)
# Sentiment analysis
try:
blob = TextBlob(text)
analysis['sentiment_polarity'] = blob.sentiment.polarity
analysis['sentiment_subjectivity'] = blob.sentiment.subjectivity
        except Exception:
analysis['sentiment_polarity'] = 0
analysis['sentiment_subjectivity'] = 0
# Keyword extraction
analysis['top_words'] = self.extract_keywords(text, top_n=10)
return analysis
    def calculate_flesch_score(self, text: str) -> float:
        # Flesch Reading Ease: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
        sentences = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
        words = len(text.split())
        syllables = sum(self.count_syllables(word) for word in text.split())
        if sentences == 0 or words == 0:
            return 0.0
        score = 206.835 - (1.015 * (words / sentences)) - (84.6 * (syllables / words))
        return max(0.0, min(100.0, score))
def count_syllables(self, word: str) -> int:
word = word.lower()
vowels = "aeiouy"
syllable_count = 0
previous_char_was_vowel = False
for char in word:
is_vowel = char in vowels
if is_vowel and not previous_char_was_vowel:
syllable_count += 1
previous_char_was_vowel = is_vowel
if word.endswith('e'):
syllable_count -= 1
return max(1, syllable_count)
    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, int]]:
# Simple keyword extraction based on frequency
words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
# Remove common stop words
        stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'this', 'that', 'these', 'those'}
words = [word for word in words if word not in stop_words]
word_freq = Counter(words)
return word_freq.most_common(top_n)
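# SQLite persistence layer: pages are upserted by URL/hash_id and can be read
# back as ScrapedData objects for later analysis.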
class DatabaseManager:
def __init__(self, db_path: str = "scraping_data.db"):
self.db_path = db_path
self.init_database()
def init_database(self):
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS scraped_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE,
title TEXT,
content TEXT,
metadata TEXT,
timestamp TEXT,
hash_id TEXT UNIQUE
)
''')
cursor.execute('''
CREATE TABLE IF NOT EXISTS analytics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
data_id INTEGER,
analysis TEXT,
created_at TEXT,
FOREIGN KEY (data_id) REFERENCES scraped_data (id)
)
''')
conn.commit()
conn.close()
def save_data(self, data: ScrapedData) -> bool:
try:
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('''
INSERT OR REPLACE INTO scraped_data
(url, title, content, metadata, timestamp, hash_id)
VALUES (?, ?, ?, ?, ?, ?)
''', (
data.url,
data.title,
data.content,
json.dumps(data.metadata),
data.timestamp.isoformat(),
data.hash_id
))
conn.commit()
conn.close()
return True
except Exception as e:
logger.error(f"Database save error: {e}")
return False
def get_all_data(self) -> List[ScrapedData]:
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('SELECT * FROM scraped_data')
rows = cursor.fetchall()
data_list = []
for row in rows:
data = ScrapedData(
url=row[1],
title=row[2],
content=row[3],
metadata=json.loads(row[4]),
timestamp=datetime.fromisoformat(row[5]),
hash_id=row[6]
)
data_list.append(data)
conn.close()
return data_list
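# High-level scraper that ties together rate limiting, optional proxy rotation,
# randomized user agents, content extraction, text analysis, and persistence.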
class WebScraper:
    def __init__(self, rate_limit: Tuple[int, int] = (10, 60), use_proxy: bool = False):
self.rate_limiter = RateLimiter(rate_limit[0], rate_limit[1])
self.content_extractor = ContentExtractor()
self.text_analyzer = TextAnalyzer()
self.db_manager = DatabaseManager()
self.session = requests.Session()
self.user_agent = UserAgent()
self.proxy_rotator = None
if use_proxy:
# Example proxy list - in practice, you'd load from a file or service
proxy_list = ['http://proxy1:8080', 'http://proxy2:8080']
self.proxy_rotator = ProxyRotator(proxy_list)
    def scrape_url(self, url: str, extract_links: bool = False) -> Optional[ScrapedData]:
        # Defined before the try block so the except handler can reference it safely
        proxies = None
        try:
            self.rate_limiter.acquire()
            headers = {'User-Agent': self.user_agent.random}
if self.proxy_rotator:
proxy = self.proxy_rotator.get_proxy()
if proxy:
proxies = {'http': proxy, 'https': proxy}
            response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Extract content
title_elements = self.content_extractor.extract_text(soup, 'title')
title = title_elements[0] if title_elements else ''
content_elements = self.content_extractor.extract_text(soup, 'content')
content = ' '.join(content_elements)
# Extract metadata
metadata = self.content_extractor.extract_metadata(soup)
metadata['status_code'] = response.status_code
metadata['content_type'] = response.headers.get('content-type', '')
if extract_links:
metadata['links'] = self.content_extractor.extract_links(soup, url)
# Analyze text
text_analysis = self.text_analyzer.analyze_text(content)
metadata.update(text_analysis)
# Create scraped data object
scraped_data = ScrapedData(
url=url,
title=title,
content=content,
metadata=metadata,
timestamp=datetime.now(),
hash_id=''
)
# Save to database
self.db_manager.save_data(scraped_data)
logger.info(f"Successfully scraped: {url}")
return scraped_data
except Exception as e:
logger.error(f"Error scraping {url}: {e}")
if self.proxy_rotator and proxies:
self.proxy_rotator.mark_failed(proxies['http'])
return None
    def scrape_multiple(self, urls: List[str], max_workers: int = 5) -> List[ScrapedData]:
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {executor.submit(self.scrape_url, url): url for url in urls}
for future in future_to_url:
try:
result = future.result(timeout=30)
if result:
results.append(result)
except Exception as e:
logger.error(f"Error in thread execution: {e}")
return results
    def crawl_website(self, start_url: str, max_depth: int = 2, max_pages: int = 50) -> List[ScrapedData]:
visited = set()
to_visit = [(start_url, 0)] # (url, depth)
results = []
while to_visit and len(results) < max_pages:
current_url, depth = to_visit.pop(0)
if current_url in visited or depth > max_depth:
continue
visited.add(current_url)
scraped_data = self.scrape_url(current_url, extract_links=True)
if scraped_data:
results.append(scraped_data)
# Add new links to crawl
if depth < max_depth and 'links' in scraped_data.metadata:
domain = urlparse(start_url).netloc
for link in scraped_data.metadata['links']:
link_url = link['url']
link_domain = urlparse(link_url).netloc
# Only crawl links from the same domain
if link_domain == domain and link_url not in visited:
to_visit.append((link_url, depth + 1))
return results
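# Builds summary reports (content, domain, sentiment, and keyword statistics)
# from stored pages and renders matplotlib histograms and bar charts.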
class DataAnalyzer:
def __init__(self, db_manager: DatabaseManager):
self.db_manager = db_manager
def generate_report(self) -> Dict[str, Any]:
data = self.db_manager.get_all_data()
if not data:
return {"error": "No data available"}
report = {
"total_pages": len(data),
"date_range": {
"start": min(d.timestamp for d in data).isoformat(),
"end": max(d.timestamp for d in data).isoformat()
},
"content_stats": self._analyze_content(data),
"domain_analysis": self._analyze_domains(data),
"sentiment_analysis": self._analyze_sentiment(data),
"keyword_analysis": self._analyze_keywords(data)
}
return report
def _analyze_content(self, data: List[ScrapedData]) -> Dict[str, Any]:
word_counts = [d.metadata.get('word_count', 0) for d in data]
reading_scores = [d.metadata.get('flesch_reading_ease', 0) for d in data]
        return {
            "avg_word_count": statistics.mean(word_counts) if word_counts else 0,
            "median_word_count": statistics.median(word_counts) if word_counts else 0,
            "avg_reading_ease": statistics.mean(reading_scores) if reading_scores else 0,
            "total_content_length": sum(len(d.content) for d in data)
        }
def _analyze_domains(self, data: List[ScrapedData]) -> Dict[str, int]:
domains = [urlparse(d.url).netloc for d in data]
return dict(Counter(domains).most_common(10))
def _analyze_sentiment(self, data: List[ScrapedData]) -> Dict[str, float]:
polarities = [d.metadata.get('sentiment_polarity', 0) for d in data]
        subjectivities = [d.metadata.get('sentiment_subjectivity', 0) for d in data]
        return {
            "avg_polarity": statistics.mean(polarities) if polarities else 0,
            "avg_subjectivity": statistics.mean(subjectivities) if subjectivities else 0
        }
def _analyze_keywords(self, data: List[ScrapedData]) -> List[Tuple[str, int]]:
all_keywords = []
for d in data:
keywords = d.metadata.get('top_words', [])
all_keywords.extend([word for word, count in keywords])
return Counter(all_keywords).most_common(20)
def create_visualizations(self):
data = self.db_manager.get_all_data()
if not data:
print("No data available for visualization")
return
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Word count distribution
word_counts = [d.metadata.get('word_count', 0) for d in data]
axes[0, 0].hist(word_counts, bins=20, alpha=0.7)
axes[0, 0].set_title('Word Count Distribution')
axes[0, 0].set_xlabel('Word Count')
axes[0, 0].set_ylabel('Frequency')
# Sentiment analysis
polarities = [d.metadata.get('sentiment_polarity', 0) for d in data]
axes[0, 1].hist(polarities, bins=20, alpha=0.7, color='green')
axes[0, 1].set_title('Sentiment Polarity Distribution')
axes[0, 1].set_xlabel('Polarity (-1 to 1)')
axes[0, 1].set_ylabel('Frequency')
# Reading ease scores
reading_scores = [d.metadata.get('flesch_reading_ease', 0) for d in data]
axes[1, 0].hist(reading_scores, bins=20, alpha=0.7, color='orange')
axes[1, 0].set_title('Reading Ease Scores')
axes[1, 0].set_xlabel('Flesch Reading Ease')
axes[1, 0].set_ylabel('Frequency')
# Domain distribution
domains = [urlparse(d.url).netloc for d in data]
domain_counts = Counter(domains).most_common(10)
if domain_counts:
domain_names, counts = zip(*domain_counts)
axes[1, 1].bar(range(len(domain_names)), counts)
axes[1, 1].set_title('Top Domains')
axes[1, 1].set_xlabel('Domain')
axes[1, 1].set_ylabel('Page Count')
axes[1, 1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
def main():
"""Demonstration of the web scraping framework"""
print("Web Scraping & Analytics Framework Demo")
print("=" * 50)
# Initialize scraper
scraper = WebScraper(rate_limit=(5, 60), use_proxy=False)
# Example URLs to scrape
urls = [
'https://httpbin.org/html',
'https://httpbin.org/json',
'https://httpbin.org/xml'
]
print(f"Scraping {len(urls)} URLs...")
# Scrape multiple URLs
results = scraper.scrape_multiple(urls, max_workers=3)
print(f"Successfully scraped {len(results)} pages")
# Generate analytics report
analyzer = DataAnalyzer(scraper.db_manager)
report = analyzer.generate_report()
print("\nAnalytics Report:")
print(json.dumps(report, indent=2, default=str))
# Export data to CSV
data = scraper.db_manager.get_all_data()
if data:
df = pd.DataFrame([{
'url': d.url,
'title': d.title,
'word_count': d.metadata.get('word_count', 0),
'sentiment': d.metadata.get('sentiment_polarity', 0),
'timestamp': d.timestamp
} for d in data])
df.to_csv('scraped_data.csv', index=False)
print(f"\nExported {len(df)} records to scraped_data.csv")
    # Create visualizations (may fail in headless environments without a display backend)
try:
analyzer.create_visualizations()
except Exception as e:
print(f"Visualization error: {e}")
if __name__ == "__main__":
main()