# Fixed by Chat GPT 2
import os
import re
import shutil
from urllib.parse import urlparse

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data_file = '/mnt/data/urldata.csv'
features_file = '/mnt/data/features.csv'
labels_file = '/mnt/data/labels.csv'
model_file = '/mnt/data/url_classifier.pkl'
script_file = '/mnt/data/detect_url.py'
# .zip rather than .rar: shutil.make_archive cannot write .rar files
archive_file = '/mnt/data/url_detection_project.zip'

# Load the labelled URL dataset (expects a 'URL' column and a binary 'Label' column)
data = pd.read_csv(data_file)
# Feature extraction function
def extract_features(url):
    features = {}

    # Parse URL
    parsed_url = urlparse(url)

    # Length features
    features['length_of_url'] = len(url)
    features['length_of_path'] = len(parsed_url.path)
    features['length_of_top_level_domain'] = len(parsed_url.hostname.split('.')[-1]) if parsed_url.hostname else 0

    # Count features
    features['count_of_-'] = url.count('-')
    features['count_of_@'] = url.count('@')
    features['count_of_?'] = url.count('?')
    features['count_of_%'] = url.count('%')
    features['count_of_.'] = url.count('.')
    features['count_of_='] = url.count('=')
    features['count_of_http'] = url.count('http')
    features['count_of_www'] = url.count('www')
    features['count_of_digits'] = sum(c.isdigit() for c in url)  # kept in sync with the inference script below
    features['count_of_number_of_directories'] = len(parsed_url.path.split('/')) - 1

    # Binary features
    shortening_services = (
        r"db\.tt", r"qr\.ae", r"adf\.ly", r"bitly\.com", r"cur\.lv", r"ow\.ly", r"ity\.im", r"q\.gs", r"po\.st", r"bc\.vc",
        r"tr\.im", r"link\.zip\.net",
    )
    pattern = re.compile('|'.join(shortening_services))
    features['use_of_shortening_url'] = 1 if pattern.search(url) else 0

    return features
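# Quick sanity check of the extractor (illustrative values only):
#   extract_features('http://ow.ly/abc?x=1')
#   -> count_of_? == 1, count_of_= == 1, use_of_shortening_url == 1 ('ow\.ly' is in the list above)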
features = data['URL'].apply(extract_features)
features_df = pd.DataFrame(features.tolist())

# The extracted features, not the raw dataset columns, are the model inputs
X = features_df
y = data['Label']

# Save the features and labels for later use
X.to_csv(features_file, index=False)
y.to_csv(labels_file, index=False)
# Train/test split and model fit. The original snippet never defined the model;
# a RandomForestClassifier is assumed here as a reasonable default.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

joblib.dump(model, model_file)
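# Optional round-trip check (illustrative): the saved model should reload and
# reproduce the held-out predictions exactly.
reloaded = joblib.load(model_file)
assert (reloaded.predict(X_test) == y_pred).all()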
script_content = """
import argparse
import re
from urllib.parse import urlparse

import joblib
import pandas as pd

model = joblib.load('url_classifier.pkl')

# Function to extract features (mirrors the training-time extract_features)
def extract_features(url):
    features = {}

    # Parse URL
    parsed_url = urlparse(url)

    # Length features
    features['length_of_url'] = len(url)
    features['length_of_path'] = len(parsed_url.path)
    features['length_of_top_level_domain'] = len(parsed_url.hostname.split('.')[-1]) if parsed_url.hostname else 0

    # Count features
    features['count_of_-'] = url.count('-')
    features['count_of_@'] = url.count('@')
    features['count_of_?'] = url.count('?')
    features['count_of_%'] = url.count('%')
    features['count_of_.'] = url.count('.')
    features['count_of_='] = url.count('=')
    features['count_of_http'] = url.count('http')
    features['count_of_www'] = url.count('www')
    features['count_of_digits'] = sum(c.isdigit() for c in url)
    features['count_of_number_of_directories'] = len(parsed_url.path.split('/')) - 1

    # Binary features
    shortening_services = (
        "db\\.tt", "qr\\.ae", "adf\\.ly", "bitly\\.com", "cur\\.lv", "ow\\.ly", "ity\\.im", "q\\.gs", "po\\.st", "bc\\.vc",
        "tr\\.im", "link\\.zip\\.net",
    )
    pattern = re.compile('|'.join(shortening_services))
    features['use_of_shortening_url'] = 1 if pattern.search(url) else 0

    return features
# Predict function
def predict_url(url):
    features = extract_features(url)
    features_df = pd.DataFrame([features])
    prediction = model.predict(features_df)[0]
    return 'Malicious' if prediction == 1 else 'Benign'

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Classify a URL as malicious or benign.')
    parser.add_argument('url', help='URL to classify')
    args = parser.parse_args()
    url = args.url
    result = predict_url(url)
    print(result)
"""
with open(script_file, 'w') as f:
    f.write(script_content)
# Bundle the generated artefacts into one archive. The exact file set is assumed;
# .zip is used because shutil cannot produce .rar archives.
project_dir = '/mnt/data/url_detection_project'
os.makedirs(project_dir, exist_ok=True)
for path in (data_file, features_file, labels_file, model_file, script_file):
    shutil.copy(path, project_dir)
shutil.make_archive(project_dir, 'zip', project_dir)
print(archive_file)