# pyspark-nlp-from-scratch
import os

# List every file under the Kaggle input directory so we can confirm the
# expected datasets are attached to this session.  (Restored the loop-body
# indentation lost in the notebook export.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
# One SparkSession per notebook; 6 GB of executor heap to hold the
# 1.6M-row Sentiment140 dataset comfortably.
spark = (
    SparkSession
    .builder
    .appName("ModelTraining")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)
import html
# Sentiment140 column layout; `polarity` is the sentiment label.
# NOTE(review): label encoding (0 = negative, 4 = positive) assumed from the
# dataset's published description -- confirm against the CSV.
schema = "polarity FLOAT, id LONG, date_time TIMESTAMP, query STRING, user STRING, text STRING"
# e.g. "Mon Apr 06 22:19:45 PDT 2009" -- the dataset's timestamp layout.
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"
# The pattern above uses pre-Spark-3 (SimpleDateFormat) syntax, so switch the
# time parser to legacy mode before any CSV is read.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

IN_PATH_RAW = "/kaggle/input/twitter-nlp/training.1600000.processed.noemoticon.csv"
IN_PATH_TEST = "/kaggle/input/twitter-nlp/testdata.manual.2009.06.14.csv"
#OUT_PATH_CLEAN = "CLEAN"

# Fix: `timestampformat` was defined but never handed to the reader, so the
# `date_time` column would be parsed with Spark's default format (and come
# back NULL for this dataset).  Apply it explicitly.
spark_reader = spark.read.schema(schema).option("timestampFormat", timestampformat)
import re  # stdlib; handy for sanity-checking these patterns outside Spark

# Twitter handle: "@" followed by 1-15 word characters.
user_regex = r"(@\w{1,15})"
# Hashtag: "#" followed by one or more word characters.
# Fix: made this a raw string -- "\w" in a plain string is an invalid escape.
hashtag_regex = r"(#\w{1,})"
# URLs with an explicit scheme, or bare "www." links.
# Fix: escaped the dot in "www\." -- the original "(www.)" matched "www"
# followed by ANY character (e.g. "wwwx"), stripping non-URL text.
url_regex = r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www\.)+([-\w+&@#/%=~|$?!:,.]*)"
# Simple e-mail matcher: local-part@domain.tld.
email_regex = r"[\w.-]+@[\w.-]+\.[a-zA-Z]{1,}"
@f.udf
def html_unescape(s: str):
    """Column UDF that decodes HTML entities in tweet text (e.g. "&amp;" -> "&").

    Anything that is not a Python string (NULL cells arrive as None) is
    passed through untouched.
    """
    if not isinstance(s, str):
        return s
    return html.unescape(s)
def clean_data(df):
    """Strip noise from raw tweets while keeping a copy of the original text.

    Drops URLs, e-mail addresses and @-mentions, turns "#" into a space,
    decodes HTML entities, and filters out rows whose text becomes empty.
    The untouched input is preserved in the `original_text` column.
    """
    cleaned = (
        df.withColumn("original_text", f.col("text"))
          .withColumn("text", f.regexp_replace(f.col("text"), url_regex, ""))
          .withColumn("text", f.regexp_replace(f.col("text"), email_regex, ""))
          .withColumn("text", f.regexp_replace(f.col("text"), user_regex, ""))
          .withColumn("text", f.regexp_replace(f.col("text"), "#", " "))
          .withColumn("text", html_unescape(f.col("text")))
          .filter("text != ''")
    )
    return cleaned
# Preview the first 10 rows of the cleaned train/test DataFrames.
# NOTE(review): `df_train_clean` / `df_test_clean` are never defined in this
# file -- presumably `clean_data(spark_reader.csv(IN_PATH_RAW))` and
# `clean_data(spark_reader.csv(IN_PATH_TEST))` from a cell lost in the
# notebook export; confirm against the original notebook.
df_train_clean.show(10,True)
df_test_clean.show(10,True)
def normalize_text(df):
    """Final text normalisation shared by the train and test sets.

    Keeps only ASCII letters and apostrophes, collapses runs of spaces,
    trims leading/trailing whitespace, and drops rows left empty.
    (Fix: this pipeline was copy-pasted verbatim for train and test;
    extracted into one helper so the two can never drift apart.)
    """
    return (
        df
        # Replace everything that is not a letter or apostrophe with a space
        # (this also removes all digits).
        .withColumn("text", f.regexp_replace(f.col("text"), "[^a-zA-Z']", " "))
        # Remove all double/multiple spaces
        .withColumn("text", f.regexp_replace(f.col("text"), " +", " "))
        # Remove leading and trailing whitespaces
        .withColumn("text", f.trim(f.col("text")))
        # Ensure we don't end up with empty rows
        .filter("text != ''")
    )

traindf = normalize_text(df_train_clean)
df_test = normalize_text(df_test_clean)
# %%time  (IPython cell magic -- not valid syntax in a plain .py file)
from pyspark.ml.feature import (
    StopWordsRemover,
    Tokenizer,
    HashingTF,
    IDF,
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Fix: the original import list contained a stray "g ," entry (a syntax
# error, almost certainly export garbage) -- removed.

# NOTE(review): the five stage objects below were referenced but never defined
# in the exported file (a cell was probably lost).  Reconstructed from the
# stage names and the column flow; confirm the column names and LogisticRegression
# hyperparameters against the original notebook.
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
hashing_tf = HashingTF(inputCol="filtered_tokens", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
lr = LogisticRegression(labelCol="polarity", featuresCol="features")

# Tokenise -> drop stop words -> hashed term frequencies -> IDF -> logistic regression.
semantic_analysis_pipeline = Pipeline(
    stages=[tokenizer, stopwords_remover, hashing_tf, idf, lr]
)

# Fix: the original fit/transform used undefined names `traindata`/`testdata`;
# the DataFrames actually built earlier in this file are `traindf` and `df_test`.
semantic_analysis_model = semantic_analysis_pipeline.fit(traindf)
semantic_analysis_model.transform(df_test).show()