Photo by Jason Leung on Unsplash

When Magento meets Python (episode: Text Analysis)

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the '^'-delimited CSV, using the 'entity_id' column as the index.
# The path is a placeholder and has to be pointed at the real file.
df = pd.read_csv(
    index_col='entity_id',
    delimiter='^',
    filepath_or_buffer='<.....>/madison.csv',
    engine='python',
)

# Keep only the first row for every distinct text in the 'value' column.
df.drop_duplicates(keep='first', subset='value', inplace=True)
# From here on the text column is called 'descr'.
df = df.rename(columns={'value': 'descr'})
# Replace missing cells with empty strings (presumably so that every entry
# handed to the vectorizer is a string). numpy must be imported for np.nan.
df.replace(np.nan, '', regex=True, inplace=True)
print(df)

# Plain bag-of-words vectorizer used for the first frequency ranking.
vectorizer = CountVectorizer()
# The documents to analyse: the 'descr' column of the cleaned DataFrame.
corpus = df.descr


def words_frequency(vectorizer, corpus):
    """Rank the vocabulary of *corpus* by its summed vectorizer weight.

    Args:
        vectorizer: An object exposing ``fit_transform(corpus)`` and, after
            fitting, a ``vocabulary_`` mapping of term -> column index
            (e.g. scikit-learn's ``CountVectorizer`` or ``TfidfVectorizer``).
            It is fitted in place by this call.
        corpus: Iterable of text documents passed to ``fit_transform``.

    Returns:
        A list of ``(term, weight)`` tuples sorted by descending weight,
        where ``weight`` is the column sum of the fitted matrix for that
        term (raw counts for a count vectorizer, summed tf-idf scores for
        a tf-idf vectorizer).
    """
    vect = vectorizer.fit_transform(corpus)
    # Collapse the document axis: one total per vocabulary column. The
    # result is indexed as a 1 x n_terms 2-D object below.
    sum_words = vect.sum(axis=0)
    words_freq = [
        (word, sum_words[0, idx])
        for word, idx in vectorizer.vocabulary_.items()
    ]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) Raw term counts using the vectorizer created earlier (no stop words).
wf_descr = words_frequency(vectorizer, corpus)
print(wf_descr[0:100])

# NLTK's English stop-word list. It is kept as a list (not a set) because
# scikit-learn documents ``stop_words`` as 'english', a list, or None, and
# recent releases reject other types during parameter validation.
# NOTE(review): the NLTK 'stopwords' corpus must already be available
# locally (e.g. via nltk.download('stopwords')) - confirm for the target env.
stop_words = stopwords.words('english')

# 2) Raw term counts with English stop words removed.
vectorizer = CountVectorizer(stop_words=stop_words)
wf_descr = words_frequency(vectorizer, corpus)
print(wf_descr[0:100])

# 3) Tf-idf weighted ranking of single terms, stop words removed.
vectorizer = TfidfVectorizer(stop_words=stop_words)
wf_descr_tf = words_frequency(vectorizer, corpus)
print(wf_descr_tf[0:100])

# 4) Tf-idf weighted ranking of bigrams only (ngram_range=(2, 2)).
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words=stop_words)
wf_body_tf = words_frequency(vectorizer, corpus)
print(wf_body_tf[0:100])

Tech consultant (antonellocalamea.com) | Avid learner | Composer | Proudly believing less is more, except for love and knowledge

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App Store
A button that says 'Get it on Google Play', and if clicked it will lead you to the Google Play store