Web Mining Lab Source Code 1-12 PRINT
import numpy as np

def page_rank(graph, d=0.85, max_iterations=100, tol=1.0e-6):
    """
    Compute the PageRank score of every page in a web graph.
    Parameters:
    graph : 2D NumPy array
        Adjacency matrix representing the links between pages (1 if page i links to page j, else 0).
    d : float
        Damping factor (usually 0.85).
    max_iterations : int
        Maximum number of iterations for the algorithm to converge.
    tol : float
        Tolerance for convergence.
    Returns:
    page_ranks : 1D NumPy array
        The PageRank score for each page.
    """
    n = graph.shape[0]
    page_ranks = np.ones(n) / n  # Start from a uniform distribution
    out_degree = graph.sum(axis=1)
    out_degree[out_degree == 0] = 1  # Avoid division by zero for pages with no outlinks
    transition = (graph / out_degree[:, None]).T  # Column-stochastic transition matrix
    for _ in range(max_iterations):
        # Standard damped PageRank update
        new_page_ranks = (1 - d) / n + d * transition.dot(page_ranks)
        if np.abs(new_page_ranks - page_ranks).sum() < tol:
            page_ranks = new_page_ranks
            break
        page_ranks = new_page_ranks
    return page_ranks
# Example usage:
if __name__ == "__main__":
# Define the adjacency matrix for the web graph
web_graph = np.array([[0, 1, 1, 0],
[1, 0, 0, 1],
[0, 1, 0, 1],
[0, 0, 1, 0]])
ranks = page_rank(web_graph)
print("Page ranks:", ranks)
Output:
Source Code:
import numpy as np

def page_rank(graph, d=0.85, max_iterations=100, tol=1.0e-6):
    """
    Compute the PageRank score of every page in a web graph.
    Parameters:
    graph : 2D NumPy array
        Adjacency matrix representing the links between pages (1 if page i links to page j, else 0).
    d : float
        Damping factor (usually 0.85).
    max_iterations : int
        Maximum number of iterations for the algorithm to converge.
    tol : float
        Tolerance for convergence.
    Returns:
    page_ranks : 1D NumPy array
        The PageRank score for each page.
    """
    n = graph.shape[0]
    page_ranks = np.ones(n) / n  # Start from a uniform distribution
    out_degree = graph.sum(axis=1)
    out_degree[out_degree == 0] = 1  # Avoid division by zero for pages with no outlinks
    transition = (graph / out_degree[:, None]).T  # Column-stochastic transition matrix
    for _ in range(max_iterations):
        # Standard damped PageRank update
        new_page_ranks = (1 - d) / n + d * transition.dot(page_ranks)
        if np.abs(new_page_ranks - page_ranks).sum() < tol:
            page_ranks = new_page_ranks
            break
        page_ranks = new_page_ranks
    return page_ranks
def display_page_ranks(graph, page_ranks):  # NOTE: helper name assumed
    """
    Display the PageRank score of each page in the graph.
    Parameters:
    graph : 2D NumPy array
        Adjacency matrix representing the links between pages.
    page_ranks : 1D NumPy array
        The PageRank scores for each page.
    """
    n = graph.shape[0]
    for i in range(n):
        print(f"Page {i}: {page_ranks[i]:.4f}")

# Example usage:
if __name__ == "__main__":
    # Define the adjacency matrix for the web graph
    web_graph = np.array([[0, 1, 1, 0],
                          [1, 0, 0, 1],
                          [0, 1, 0, 1],
                          [0, 0, 1, 0]])
    ranks = page_rank(web_graph)
    display_page_ranks(web_graph, ranks)
# Required Libraries
import re
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import nltk
def fetch_webpage(url):
"""
Fetches the webpage content from the URL.
"""
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
print(f"Failed to fetch the webpage: Status code {response.status_code}")
return None
def clean_html(raw_html):
"""
Cleans the raw HTML and extracts text content.
"""
soup = BeautifulSoup(raw_html, 'html.parser')
text = soup.get_text()
return text
def preprocess_text(text):
    """
    Performs various text preprocessing steps including:
    - Lowercasing
    - Removing punctuation/numbers
    - Tokenization
    - Stopword removal
    - Stemming/Lemmatization
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize the text into words
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize each word using its part-of-speech tag
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag(words)]
    return words
def get_wordnet_pos(treebank_tag):
"""
Helper function to convert POS tags to WordNet POS format for better lemmatization.
"""
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
return wordnet.VERB
elif treebank_tag.startswith('N'):
return wordnet.NOUN
elif treebank_tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
def preprocess_webpage(url):
    """
    Fetches a webpage, cleans and preprocesses the text from it.
    """
    # Step 1: Fetch webpage
    raw_html = fetch_webpage(url)
    if raw_html:
        # Step 2: Clean HTML and extract text
        cleaned_text = clean_html(raw_html)
        # Step 3: Preprocess the extracted text
        return preprocess_text(cleaned_text)
    return None
# Example usage:
if __name__ == "__main__":
    # URL of the webpage to preprocess
    url = "https://www.example.com"
    processed_words = preprocess_webpage(url)
    print(processed_words)
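Note: the tokenizer, stopword list, WordNet lemmatizer, and POS tagger used above depend on NLTK data packages that are downloaded separately from the library itself. A typical one-time setup (resource names as distributed by NLTK) is:

import nltk
# One-time download of the NLTK resources used by the preprocessing pipeline
nltk.download('punkt')                       # word_tokenize
nltk.download('stopwords')                   # stopwords.words('english')
nltk.download('wordnet')                     # WordNetLemmatizer
nltk.download('omw-1.4')                     # WordNet data required by newer NLTK releases
nltk.download('averaged_perceptron_tagger')  # pos_tag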
import networkx as nx
import matplotlib.pyplot as plt
def create_social_network():
"""
Create a social network graph.
Nodes represent individuals, and edges represent relationships between them.
"""
G = nx.Graph()
return G
def plot_social_network(G):
"""
Plot the social network graph.
"""
plt.figure(figsize=(8, 8))
pos = nx.spring_layout(G) # Layout for the graph
nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=2000, font_size=15,
font_weight='bold', edge_color='gray')
plt.title('Social Network', fontsize=20)
plt.show()
def calculate_centrality_measures(G):
"""
Calculate and print centrality measures for each node in the social network graph.
"""
# Degree Centrality
degree_centrality = nx.degree_centrality(G)
# Closeness Centrality
closeness_centrality = nx.closeness_centrality(G)
# Betweenness Centrality
betweenness_centrality = nx.betweenness_centrality(G)
print("\nCentrality Measures:")
for node in G.nodes():
print(f"\nNode: {node}")
print(f" Degree Centrality: {degree_centrality[node]:.4f}")
print(f" Closeness Centrality: {closeness_centrality[node]:.4f}")
print(f" Betweenness Centrality: {betweenness_centrality[node]:.4f}")
def analyze_social_network():
    """
    Main function to create, visualize, and analyze the social network.
    """
    # Create the social network
    G = create_social_network()
    # Visualize the network
    plot_social_network(G)
    # Calculate and print centrality measures
    calculate_centrality_measures(G)
# Example usage
if __name__ == "__main__":
analyze_social_network()
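As printed, create_social_network() builds an empty graph: the individuals and relationships are not shown in this listing. One way to populate it, using purely illustrative names and friendships, is to give the function a body along these lines:

import networkx as nx

def create_social_network():
    """
    Create a social network graph with sample members.
    Nodes represent individuals, and edges represent relationships between them.
    (The names and relationships below are illustrative placeholders.)
    """
    G = nx.Graph()
    G.add_edges_from([
        ('Alice', 'Bob'), ('Alice', 'Claire'), ('Bob', 'Claire'),
        ('Claire', 'Dennis'), ('Dennis', 'Esther'), ('Esther', 'Frank'),
        ('Dennis', 'Frank'),
    ])
    return G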
Output:
Source Code:
from textblob import TextBlob

def analyze_opinion(text):
    """
    Perform sentiment analysis on the given text.
    Parameters:
    text (str): The input text to analyze.
    Returns:
    sentiment (str): The sentiment of the text (Positive, Negative, or Neutral).
    polarity (float): The polarity score of the text (-1 to 1).
    subjectivity (float): The subjectivity score of the text (0 to 1).
    """
    # Create a TextBlob object
    blob = TextBlob(text)
    # Extract the polarity and subjectivity scores
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    # Classify the overall sentiment from the polarity score
    if polarity > 0:
        sentiment = "Positive"
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, polarity, subjectivity
def analyze_opinions(text_list):
"""
Analyze sentiment for a list of texts.
Parameters:
text_list (list of str): List of texts to analyze.
Returns:
None
"""
for idx, text in enumerate(text_list):
sentiment, polarity, subjectivity = analyze_opinion(text)
print(f"\nText {idx+1}: {text}")
print(f" Sentiment: {sentiment}")
print(f" Polarity: {polarity:.4f}")
print(f" Subjectivity: {subjectivity:.4f}")
# Example usage
if __name__ == "__main__":
    # List of texts (opinions or reviews)
    texts = [
        "I absolutely love this product! It works wonders.",
        "The movie was okay, but I found it a bit too long.",
        "I'm very disappointed with the service. It was terrible.",
        "The weather is perfect today. I'm feeling great.",
        "The food was bland and tasteless. Not worth the money."
    ]
    analyze_opinions(texts)
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def analyze_sentiment(text):
"""
Analyze the sentiment of a given text using VADER.
Parameters:
text (str): The input text to analyze.
Returns:
dict: Sentiment scores (positive, neutral, negative, and compound).
"""
    # Initialize the VADER sentiment intensity analyzer
    sia = SentimentIntensityAnalyzer()
    # Compute the sentiment scores for the text
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores
def analyze_texts(text_list):
"""
Analyze sentiment for a list of texts.
Parameters:
text_list (list of str): List of texts to analyze.
Returns:
None
"""
    for idx, text in enumerate(text_list):
        scores = analyze_sentiment(text)
        print(f"\nText {idx+1}: {text}")
        print(f"  Positive Score: {scores['pos']}")
        print(f"  Neutral Score: {scores['neu']}")
        print(f"  Negative Score: {scores['neg']}")
        print(f"  Compound Score: {scores['compound']:.4f}")
        # Determine the overall sentiment based on the compound score
        if scores['compound'] >= 0.05:
            sentiment = "Positive"
        elif scores['compound'] <= -0.05:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"
        print(f"  Overall Sentiment: {sentiment}")
# Example usage
if __name__ == "__main__":
    # List of sample texts (reviews, comments, or opinions)
    texts = [
        "I love this product! It works great.",
        "This movie was terrible. I wasted my time.",
        "The food was okay, but nothing special.",
        "What a fantastic day! I'm so happy right now.",
        "The service was really slow and disappointing."
    ]
    analyze_texts(texts)
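VADER is distributed as an NLTK resource, so its lexicon must be fetched once before SentimentIntensityAnalyzer can be constructed. A typical one-time setup is:

import nltk
# One-time download of the lexicon used by SentimentIntensityAnalyzer
nltk.download('vader_lexicon')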
import requests
from bs4 import BeautifulSoup
import time
import random
import urllib.parse
import urllib.robotparser

def check_robots(url):
"""
Check if the URL is allowed to be scraped based on robots.txt.
Parameters:
url (str): The website URL to check.
Returns:
bool: True if scraping is allowed, False otherwise.
"""
parsed_url = urllib.parse.urlparse(url)
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
rp = urllib.robotparser.RobotFileParser()
rp.set_url(robots_url)
rp.read()
return rp.can_fetch("*", url)
def scrape_web_content(url):
"""
Scrape web content while considering privacy regulations.
Parameters:
url (str): The website URL to scrape.
Returns:
str: The scraped content or a message indicating scraping is not allowed.
"""
if not check_robots(url):
return "Scraping is not allowed for this URL according to robots.txt."
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses
        # Parse the HTML and extract the visible text content
        soup = BeautifulSoup(response.text, 'html.parser')
        text_content = soup.get_text(separator=' ', strip=True)
        return text_content
except requests.exceptions.RequestException as e:
return f"An error occurred: {e}"
def main():
    # List of URLs to scrape
    urls = [
        "https://www.example.com",
        "https://www.wikipedia.org",
        # Add more URLs as needed
    ]
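    # Assumed continuation: the printed listing is truncated here.
    # The time and random imports above suggest each URL is fetched in turn
    # with a short randomized delay between requests.
    for url in urls:
        print(f"\nScraping: {url}")
        content = scrape_web_content(url)
        print(content[:500])  # Preview the first 500 characters of the result
        time.sleep(random.uniform(1, 3))  # Polite random delay between requests

if __name__ == "__main__":
    main()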
import pandas as pd
import random
from datetime import datetime, timedelta
def generate_sample_logs(num_logs):
    """
    Generate sample web server access log entries.
    Parameters:
    num_logs (int): Number of sample logs to generate.
    Returns:
    list: List of sample access log entries.
    """
ip_addresses = [f"192.168.1.{i}" for i in range(1, 21)] # Sample IP addresses
paths = ["/home", "/about", "/contact", "/products", "/services"]
logs = []
for _ in range(num_logs):
ip = random.choice(ip_addresses)
        timestamp = datetime.now() - timedelta(minutes=random.randint(1, 60))  # Random timestamp within the last hour
path = random.choice(paths)
logs.append((ip, timestamp, f"GET {path} HTTP/1.1"))
return logs
def create_log_dataframe(logs):
"""
Create a DataFrame from the generated logs.
Parameters:
logs (list): List of log entries.
Returns:
    pd.DataFrame: DataFrame containing the access logs.
    """
return pd.DataFrame(logs, columns=['IP', 'Timestamp', 'Request'])
def extract_page_views(log_df):
"""
Extract the most visited pages from the access logs.
Parameters:
log_df (pd.DataFrame): DataFrame containing the access logs.
Returns:
pd.DataFrame: DataFrame with page view counts.
"""
log_df['URL'] = log_df['Request'].str.split(' ').str[1] # Extract URL from the request
page_views = log_df['URL'].value_counts().reset_index()
page_views.columns = ['URL', 'Count']
return page_views
def analyze_sessions(log_df):
"""
Analyze user sessions based on IP address and timestamps.
Parameters:
log_df (pd.DataFrame): DataFrame containing the access logs.
Returns:
pd.DataFrame: DataFrame with session information.
"""
    session_df = log_df.sort_values(by=['IP', 'Timestamp'])
    # A gap of more than 30 minutes between requests from the same IP starts a new session
    gap = session_df.groupby('IP')['Timestamp'].diff() > pd.Timedelta(minutes=30)
    session_df['SessionID'] = gap.astype(int).groupby(session_df['IP']).cumsum()  # Identify sessions per IP
    session_count = session_df.groupby(['IP', 'SessionID']).size().reset_index(name='SessionCount')
    return session_count
def main():
    # Generate sample access logs
    num_logs = 100  # Number of sample logs to generate
    sample_logs = generate_sample_logs(num_logs)
    log_df = create_log_dataframe(sample_logs)
    print("Most visited pages:\n", extract_page_views(log_df))
    print("\nUser sessions per IP:\n", analyze_sessions(log_df))
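The printout ends this program after main(); presumably it is run under the usual guard:

# Assumed entry point (not shown in the printed listing)
if __name__ == "__main__":
    main()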
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Create a DataFrame
df = pd.DataFrame(data)
# Calculate similarity
similarity = cosine_similarity(matrix)
def main():
user = 'Alice' # Specify the user for recommendations
recommendations = recommend_items(user, user_item_matrix)
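Only scattered lines of this program survive in the printout: the DataFrame construction, the cosine-similarity call, and the recommend_items call in main(). A self-contained sketch of user-based collaborative filtering along those lines, with a toy ratings matrix and a recommend_items helper whose details are assumed, is:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Toy user-item ratings (0 = not rated); the values are illustrative only
data = {
    'Item1': [5, 4, 0, 0],
    'Item2': [3, 0, 4, 2],
    'Item3': [0, 2, 5, 0],
    'Item4': [1, 0, 0, 5],
}
user_item_matrix = pd.DataFrame(data, index=['Alice', 'Bob', 'Claire', 'Dennis'])

def recommend_items(user, matrix, top_n=2):
    """Recommend unrated items for `user` using user-based cosine similarity."""
    # Calculate similarity between every pair of users
    similarity = pd.DataFrame(cosine_similarity(matrix),
                              index=matrix.index, columns=matrix.index)
    # Weight the other users' ratings by their similarity to the target user
    weights = similarity[user].drop(user)
    scores = matrix.drop(index=user).T.dot(weights)
    # Keep only items the target user has not rated yet
    unrated = matrix.loc[user] == 0
    return scores[unrated].sort_values(ascending=False).head(top_n)

def main():
    user = 'Alice'  # Specify the user for recommendations
    recommendations = recommend_items(user, user_item_matrix)
    print(f"Recommended items for {user}:\n{recommendations}")

if __name__ == "__main__":
    main()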
import networkx as nx
import matplotlib.pyplot as plt
def create_web_graph(data):
    """Create a directed graph from web data."""
    G = nx.DiGraph()  # Create a directed graph
    for page, links in data.items():
        for link in links:
            G.add_edge(page, link)  # Add edges for links
    return G
def analyze_graph(G):
"""Analyze the web graph to find the most linked pages."""
# Calculate in-degree (number of incoming links)
in_degrees = G.in_degree()
sorted_pages = sorted(in_degrees, key=lambda x: x[1], reverse=True)
return sorted_pages
def plot_graph(G):
"""Visualize the web structure graph."""
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=2000, font_size=10,
font_weight='bold', arrows=True)
plt.title("Web Structure
Graph") plt.show()
def main():
# Create the web graph from the sample data
web_graph = create_web_graph(web_data)
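The link data (web_data) and the rest of main() are not shown in the printout. A plausible completion, assuming a small illustrative set of pages and links and reusing the functions defined above, is:

# Illustrative link data: each page maps to the pages it links to
web_data = {
    'PageA': ['PageB', 'PageC'],
    'PageB': ['PageC'],
    'PageC': ['PageA'],
    'PageD': ['PageC', 'PageA'],
}

def main():
    # Create the web graph from the sample data
    web_graph = create_web_graph(web_data)
    # Report pages ranked by number of incoming links
    for page, in_degree in analyze_graph(web_graph):
        print(f"{page}: {in_degree} incoming link(s)")
    # Visualize the structure
    plot_graph(web_graph)

if __name__ == "__main__":
    main()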
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Get the pairwise similarity scores of all movies with that movie
sim_scores = list(enumerate(cosine_sim[idx]))
def main():
# Example: Recommend movies similar to 'The Matrix'
movie_name = 'The Matrix'
recommendations = get_recommendations(movie_name)
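Apart from the imports, one similarity-scoring line, and the main() stub, this program is missing from the printout. A self-contained sketch of a content-based recommender in the same spirit, using a toy movie table and a get_recommendations helper whose details are assumed, is:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy movie catalogue; titles and descriptions are illustrative only
movies = pd.DataFrame({
    'title': ['The Matrix', 'John Wick', 'Toy Story', 'Finding Nemo', 'Inception'],
    'description': [
        'A hacker discovers reality is a simulation and fights intelligent machines.',
        'A retired assassin returns for revenge in a stylish action thriller.',
        'Animated toys come to life and go on an adventure together.',
        'An animated clownfish searches the ocean for his missing son.',
        'A thief enters dreams to plant an idea in a mind-bending heist.',
    ],
})

# Build TF-IDF vectors from the descriptions and compute pairwise similarity
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['description'])
cosine_sim = cosine_similarity(tfidf_matrix)
indices = pd.Series(movies.index, index=movies['title'])

def get_recommendations(title, top_n=3):
    """Return the movie titles most similar to `title` by description."""
    idx = indices[title]
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort by similarity and skip the movie itself
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n + 1]
    return movies['title'].iloc[[i for i, _ in sim_scores]]

def main():
    # Example: Recommend movies similar to 'The Matrix'
    movie_name = 'The Matrix'
    recommendations = get_recommendations(movie_name)
    print(f"Movies similar to {movie_name}:")
    print(recommendations.to_string(index=False))

if __name__ == "__main__":
    main()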
import networkx as nx
import matplotlib.pyplot as plt
return G
return pagerank
def main():
# Create the initial web graph
    web_graph = create_initial_web_graph()
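Only the imports, two stray return statements, and the first line of main() survive for this last program. A self-contained sketch of what it appears to do (build a small directed web graph, compute PageRank with NetworkX, and visualize the result), with helper names assumed where the printout gives none, is:

import networkx as nx
import matplotlib.pyplot as plt

def create_initial_web_graph():
    """Build a small directed web graph; the edges below are illustrative."""
    G = nx.DiGraph()
    G.add_edges_from([
        ('A', 'B'), ('A', 'C'), ('B', 'C'),
        ('C', 'A'), ('D', 'C'), ('D', 'A'),
    ])
    return G

def compute_pagerank(G):  # helper name assumed
    """Compute PageRank scores for the graph using NetworkX."""
    pagerank = nx.pagerank(G, alpha=0.85)
    return pagerank

def plot_web_graph(G, pagerank):  # helper name assumed
    """Draw the graph, sizing each node by its PageRank score."""
    pos = nx.spring_layout(G)
    sizes = [5000 * pagerank[node] for node in G.nodes()]
    nx.draw(G, pos, with_labels=True, node_size=sizes, node_color='lightblue', arrows=True)
    plt.title('Web Graph with PageRank')
    plt.show()

def main():
    # Create the initial web graph
    web_graph = create_initial_web_graph()
    # Compute and report PageRank scores, highest first
    ranks = compute_pagerank(web_graph)
    for page, score in sorted(ranks.items(), key=lambda x: x[1], reverse=True):
        print(f"{page}: {score:.4f}")
    plot_web_graph(web_graph, ranks)

if __name__ == "__main__":
    main()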