You can fix this either by adding the missing words to your corpus dictionary (or by adding the words to the corpus and building a dictionary from it), or by adding this line to the site-packages\pyLDAvis\gensim.py code before "assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]" (should be around line 67):
topic_term_dists = topic_term_dists / topic_term_dists.sum(axis=1)[:, None]
Run a sum on your document vectors. If any row sums to 0, you probably want to drop that row/document.
np.sum(lda.transform(docu_term_matrix), axis=1)
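For illustration, a minimal sketch (assuming, as in the answer above, that lda is a fitted topic model with a transform() method and docu_term_matrix is the document-term matrix it was trained on) that finds and drops documents whose topic distribution sums to 0:

import numpy as np

doc_topic = lda.transform(docu_term_matrix)   # shape: (n_docs, n_topics)
row_sums = doc_topic.sum(axis=1)
empty_docs = np.where(row_sums == 0)[0]       # indices of the offending documents
print("documents to drop:", empty_docs)

# keep only documents with a non-zero topic distribution before calling pyLDAvis
doc_topic = doc_topic[row_sums > 0]
docu_term_matrix = docu_term_matrix[row_sums > 0]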
I tried generating topics using BTM. On trying to visualize the topics, I get a validation error. I can print the topics after model training, but it fails on using pyLDAvis.
# imports assumed for this snippet (oBTM and the helper functions come from the biterm package)
import numpy as np
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary

def btm_model():
    num_topics = 10
    texts = open('./textfiles/Ori-Apr2, 2019.txt').read().splitlines()
    # vectorize texts
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()
    # get vocabulary
    vocab = np.array(vec.get_feature_names())
    # get biterms
    biterms = vec_to_biterms(X)
    # create btm
    btm = oBTM(num_topics=num_topics, V=vocab)
    print("\n\n Train Online BTM ..")
    for i in range(0, 1):
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=10)
    print("\n\n Topic coherence ..")
    res, C_z_sum = topic_summuary(btm.phi_wz.T, X, vocab, 10)
    topics = btm.transform(biterms)
    print("\n\n Visualize Topics ..")
    vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
    pyLDAvis.save_html(vis, './textfiles/online_btm.html')
I got the below error when running the above with pyLDAvis:
Traceback (most recent call last):
File "main_mining.py", line 293, in <module>
btm_model(num_topics)
File "main_mining.py", line 187, in btm_model
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
File "C:\Python Install Location\lib\site-packages\pyLDAvis\_prepare.py", line 375, in prepare
_input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency)
File "C:\Python Install Location\lib\site-packages\pyLDAvis\_prepare.py", line 65, in _input_validate
raise ValidationError('\n' + '\n'.join([' * ' + s for s in res]))
pyLDAvis._prepare.ValidationError:
* Not all rows (distributions) in doc_topic_dists sum to 1.
Updated July 21st, 2022
If you feed the model data, it will give you sets of weighted words, and each set of words describes a topic. For example:
(0, '0.024*"ban" + 0.017*"order" + 0.015*"refugee" + 0.015*"law" + 0.013*"trump" '
'+ 0.011*"kill" + 0.011*"country" + 0.010*"attack" + 0.009*"state" + '
'0.009*"immigration"')
(1, '0.020*"student" + 0.020*"work" + 0.019*"great" + 0.017*"learn" + '
'0.017*"school" + 0.015*"talk" + 0.014*"support" + 0.012*"community" + '
'0.010*"share" + 0.009*"event")
Install pyLDAvis with:
pip install pyldavis
Moving on, let’s import relevant libraries:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from pprint import pprint
import spacy
import pickle
import re
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import pandas as pd
We start by converting a collection of words to a bag of words, which is a list of tuples (word_id, word_frequency). gensim.corpora.Dictionary is a great tool for this:
id2word = Dictionary(tweets)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in tweets]

print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 3), (4, 1), (5, 2), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 2), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), ..., (347, 1), (348, 1), (349, 2), (350, 1), (351, 1), (352, 1), (353, 1), (354, 1), (355, 1), (356, 1), (357, 1), (358, 1), (359, 1), (360, 1), (361, 1), (362, 2), (363, 1), (364, 4), (365, 1), (366, 1), (367, 3), (368, 1), (369, 8), (370, 1), (371, 1), (372, 1), (373, 4)]]
What do these tuples mean? Let’s convert them into human readable format to understand:
[
    [(id2word[i], freq) for i, freq in doc]
    for doc in corpus[:1]
]
[
[("'d", 1),
('-', 1),
('absolutely', 1),
('aca', 3),
('act', 1),
('action', 2),
('add', 2),
('administrative', 1),
('affordable', 1),
('allow', 1),
('amazing', 1),
...
('way', 4),
('week', 1),
('well', 1),
('will', 3),
('wonder', 1),
('work', 8),
('world', 1),
('writing', 1),
('wrong', 1),
('year', 4)
]
]
In this article, we will use a million news headlines dataset from Kaggle. If you want to follow the analysis step-by-step you may want to install the following libraries:
pip install pandas matplotlib numpy \
    nltk seaborn sklearn gensim pyldavis \
    wordcloud textblob spacy textstat
Now, we can take a look at the data.
news = pd.read_csv('data/abcnews-date-text.csv', nrows = 10000)
news.head(3)
December 4, 2018
Let’s begin by importing the packages and the 20 News Groups dataset.
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
Let’s import the news groups dataset and retain only 4 of the target_names categories.
# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
df = df.loc[df.target_names.isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles']), :]
print(df.shape)  # > (2361, 3)
df.head()
Remove the emails, newline characters and single quotes, and finally split each sentence into a list of words using gensim’s simple_preprocess(). Setting the deacc=True option removes punctuation.
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)        # remove newline chars
        sent = re.sub(r"\'", "", sent)          # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True)
        yield(sent)
# Convert to list
data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])
# [['from', 'irwin', 'arnstein', 'subject', 're', 'recommendation', 'on', 'duc', 'summary', 'whats', 'it', 'worth', 'distribution', 'usa', 'expires', 'sat', 'may', 'gmt', ...truncated...]]
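The model-building step below uses data_ready, i.e. data_words after stopword removal and lemmatization. The original walkthrough elides that step; a minimal sketch of one way to produce it (process_words is a hypothetical helper, and it assumes the stop_words list from above plus a spaCy English model):

# Hypothetical preprocessing helper producing data_ready (stopword removal + lemmatization)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def process_words(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    out = []
    for doc_words in texts:
        words = [w for w in doc_words if w not in stop_words]  # remove stopwords
        doc = nlp(" ".join(words))
        out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return out

data_ready = process_words(data_words)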
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=4,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=10,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)

pprint(lda_model.print_topics())
# > [(0,
# >   '0.017*"write" + 0.015*"people" + 0.014*"organization" + 0.014*"article" + '
# >   '0.013*"time" + 0.008*"give" + 0.008*"first" + 0.007*"tell" + 0.007*"new" + '
# >   '0.007*"question"'),
# >  (1,
# >   '0.008*"christian" + 0.008*"believe" + 0.007*"god" + 0.007*"law" + '
# >   '0.006*"state" + 0.006*"israel" + 0.006*"israeli" + 0.005*"exist" + '
# >   '0.005*"way" + 0.004*"bible"'),
# >  (2,
# >   '0.024*"armenian" + 0.012*"bike" + 0.006*"kill" + 0.006*"work" + '
# >   '0.005*"well" + 0.005*"year" + 0.005*"sumgait" + 0.005*"soldier" + '
# >   '0.004*"way" + 0.004*"ride"'),
# >  (3,
# >   '0.019*"team" + 0.019*"game" + 0.013*"hockey" + 0.010*"player" + '
# >   '0.009*"play" + 0.009*"win" + 0.009*"nhl" + 0.009*"year" + 0.009*"hawk" + '
# >   '0.009*"season"')]
Next, let’s find the dominant topic in each document; this way, you will know which document belongs predominantly to which topic.
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)
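To visualize the trained model with pyLDAvis (the pyLDAvis.gensim module was imported earlier), a minimal sketch; the output path is just an example:

# Build the interactive topic visualization for the trained model
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, './lda_topics.html')  # example output path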