Fixed by Chat GPT 2

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import shutil

# Define file paths
data_file = '/mnt/data/urldata.csv'
features_file = '/mnt/data/features.csv'
labels_file = '/mnt/data/labels.csv'


model_file = '/mnt/data/url_classifier.pkl'
script_file = '/mnt/data/detect_url.py'
archive_file = '/mnt/data/url_detection_project.zip'

# Load the dataset
data = pd.read_csv(data_file)
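
# The steps below assume urldata.csv provides at least a 'URL' and a 'Label'
# column, since those are the only columns referenced later. A minimal sanity
# check under that assumption:
assert {'URL', 'Label'}.issubset(data.columns), "urldata.csv must have 'URL' and 'Label' columns"
print(data.shape)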
# Feature extraction function
def extract_features(url):
    from urllib.parse import urlparse
    import re

    features = {}

    # Parse the URL into its components
    parsed_url = urlparse(url)

    # Length features
    features['length_of_url'] = len(url)
    features['length_of_hostname'] = len(parsed_url.hostname) if parsed_url.hostname else 0
    features['length_of_path'] = len(parsed_url.path)
    features['length_of_first_directory'] = len(parsed_url.path.split('/')[1]) if len(parsed_url.path.split('/')) > 1 else 0
    features['length_of_top_level_domain'] = len(parsed_url.hostname.split('.')[-1]) if parsed_url.hostname else 0

    # Count features
    features['count_of_-'] = url.count('-')
    features['count_of_@'] = url.count('@')
    features['count_of_?'] = url.count('?')
    features['count_of_%'] = url.count('%')
    features['count_of_.'] = url.count('.')
    features['count_of_='] = url.count('=')
    features['count_of_http'] = url.count('http')
    features['count_of_www'] = url.count('www')
    features['count_of_digits'] = sum(c.isdigit() for c in url)
    features['count_of_letters'] = sum(c.isalpha() for c in url)
    features['count_of_number_of_directories'] = len(parsed_url.path.split('/')) - 1

    # Binary features
    features['use_of_ip'] = 1 if re.match(r'^(http|https)://\d+\.\d+\.\d+\.\d+', url) else 0

    shortening_services = (
        r"bit\.ly", r"goo\.gl", r"shorte\.st", r"t\.co", r"tinyurl\.com", r"tr\.im", r"is\.gd", r"cli\.gs",
        r"yfrog\.com", r"migre\.me", r"ff\.im", r"tiny\.cc", r"url4\.eu", r"twit\.ac", r"su\.pr", r"twurl\.nl",
        r"snipurl\.com", r"short\.to", r"BudURL\.com", r"ping\.fm", r"post\.ly", r"Just\.as", r"bkite\.com",
        r"snipr\.com", r"fic\.kr", r"loopt\.us", r"doiop\.com", r"short\.ie", r"kl\.am", r"wp\.me",
        r"rubyurl\.com", r"om\.ly", r"to\.ly", r"bit\.do", r"lnkd\.in", r"db\.tt", r"qr\.ae", r"adf\.ly",
        r"bitly\.com", r"cur\.lv", r"ow\.ly", r"ity\.im", r"q\.gs", r"po\.st", r"bc\.vc", r"twitthis\.com",
        r"u\.to", r"j\.mp", r"buzurl\.com", r"cutt\.ly", r"u\.bb", r"yourls\.org", r"t2mio\.com", r"v\.gd",
        r"link\.zip\.net",
    )
    pattern = re.compile('|'.join(shortening_services))
    features['use_of_shortening_url'] = 1 if pattern.search(url) else 0

    return features
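
# Quick spot check of the extractor on one arbitrary example URL
# (illustration only, not part of the training pipeline):
sample_features = extract_features('http://bit.ly/abc123?x=1')
print(sample_features['use_of_shortening_url'])  # 1, since bit.ly is in the shortener list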

# Apply feature extraction to every URL in the dataset
features = data['URL'].apply(extract_features)
features_df = pd.DataFrame(features.tolist())
data = pd.concat([data, features_df], axis=1)

# Separate features and labels
X = data.drop(columns=['URL', 'Label'])
y = data['Label']

# Save the features and labels for later use
X.to_csv(features_file, index=False)
y.to_csv(labels_file, index=False)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Save the model
joblib.dump(model, model_file)
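
# Optional round-trip check (a sketch, not part of the original pipeline):
# reload the dumped model and confirm it re-scores the held-out split
reloaded_model = joblib.load(model_file)
print(f'Reloaded accuracy: {accuracy_score(y_test, reloaded_model.predict(X_test))}')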

# Create the detect_url.py script
script_content = """
import joblib
import argparse
import pandas as pd

# Load the model (expects url_classifier.pkl in the working directory)
model = joblib.load('url_classifier.pkl')

# Function to extract features (same logic as the training-time extract_features)
def extract_features(url):
    from urllib.parse import urlparse
    import re

    features = {}

    # Parse the URL into its components
    parsed_url = urlparse(url)

    # Length features
    features['length_of_url'] = len(url)
    features['length_of_hostname'] = len(parsed_url.hostname) if parsed_url.hostname else 0
    features['length_of_path'] = len(parsed_url.path)
    features['length_of_first_directory'] = len(parsed_url.path.split('/')[1]) if len(parsed_url.path.split('/')) > 1 else 0
    features['length_of_top_level_domain'] = len(parsed_url.hostname.split('.')[-1]) if parsed_url.hostname else 0

    # Count features
    features['count_of_-'] = url.count('-')
    features['count_of_@'] = url.count('@')
    features['count_of_?'] = url.count('?')
    features['count_of_%'] = url.count('%')
    features['count_of_.'] = url.count('.')
    features['count_of_='] = url.count('=')
    features['count_of_http'] = url.count('http')
    features['count_of_www'] = url.count('www')
    features['count_of_digits'] = sum(c.isdigit() for c in url)
    features['count_of_letters'] = sum(c.isalpha() for c in url)
    features['count_of_number_of_directories'] = len(parsed_url.path.split('/')) - 1

    # Binary features
    features['use_of_ip'] = 1 if re.match(r'^(http|https)://\\d+\\.\\d+\\.\\d+\\.\\d+', url) else 0

    shortening_services = (
        r"bit\\.ly", r"goo\\.gl", r"shorte\\.st", r"t\\.co", r"tinyurl\\.com", r"tr\\.im", r"is\\.gd",
        r"cli\\.gs", r"yfrog\\.com", r"migre\\.me", r"ff\\.im", r"tiny\\.cc", r"url4\\.eu", r"twit\\.ac",
        r"su\\.pr", r"twurl\\.nl", r"snipurl\\.com", r"short\\.to", r"BudURL\\.com", r"ping\\.fm",
        r"post\\.ly", r"Just\\.as", r"bkite\\.com", r"snipr\\.com", r"fic\\.kr", r"loopt\\.us",
        r"doiop\\.com", r"short\\.ie", r"kl\\.am", r"wp\\.me", r"rubyurl\\.com", r"om\\.ly", r"to\\.ly",
        r"bit\\.do", r"lnkd\\.in", r"db\\.tt", r"qr\\.ae", r"adf\\.ly", r"bitly\\.com", r"cur\\.lv",
        r"ow\\.ly", r"ity\\.im", r"q\\.gs", r"po\\.st", r"bc\\.vc", r"twitthis\\.com", r"u\\.to",
        r"j\\.mp", r"buzurl\\.com", r"cutt\\.ly", r"u\\.bb", r"yourls\\.org", r"t2mio\\.com", r"v\\.gd",
        r"link\\.zip\\.net",
    )
    pattern = re.compile('|'.join(shortening_services))
    features['use_of_shortening_url'] = 1 if pattern.search(url) else 0

    return features

# Predict function
def predict_url(url):
    features = extract_features(url)
    features_df = pd.DataFrame([features])
    prediction = model.predict(features_df)[0]
    return 'Malicious' if prediction == 1 else 'Benign'

# Main function for CLI
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='URL Maliciousness Detection')
    parser.add_argument('url', type=str, help='URL to be classified')
    args = parser.parse_args()
    url = args.url
    result = predict_url(url)
    print(f'The URL is: {result}')
"""

# Write the script to a file
with open(script_file, 'w') as f:
    f.write(script_content)
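
# Example usage of detect_url.py from a shell (the URL below is arbitrary,
# for illustration only; url_classifier.pkl must be in the working directory):
#     python detect_url.py "http://bit.ly/abc123"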

# Create a ZIP archive with the project files
# (shutil cannot create RAR archives; its supported formats are zip, tar,
# gztar, bztar, and xztar, so ZIP is used here)
shutil.make_archive('/mnt/data/url_detection_project', 'zip', '/mnt/data')
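# Optional: list the archive contents to confirm the expected files are inside
# (a sketch; zipfile is in the standard library)
import zipfile
with zipfile.ZipFile(archive_file) as zf:
    print(zf.namelist())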
