TfidfVectorizer: ValueError: not a built-in stop list: russian

  • Last Update :
  • Techknowledgy :

Use the following code:

from sklearn.feature_extraction import text

# Extend sklearn's built-in English stop-word list with extra terms.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["russian"])

# BUG FIX: pass the stop-word collection itself, NOT the string
# 'my_stop_words'. Any string other than 'english' is rejected by
# sklearn with: ValueError: not a built-in stop list: ...
# Also: only `text` was imported, so qualify TfidfVectorizer through it.
Tfidf = text.TfidfVectorizer(stop_words=my_stop_words)
Z = Tfidf.fit_transform(X)

Suggestion : 2

.ENGLISH_STOP_WORDS , sklearn.feature_extraction.text.TfidfVectorizer()

def build_document_term_matrix(self):
    """Fit a TF-IDF vectorizer on self.target_bows and build the matrix.

    Populates:
        self.tfidf_vectorizer: the fitted TfidfVectorizer instance.
        self.document_term_matrix: sparse (documents x terms) TF-IDF matrix.
        self.terms: list mapping column index -> term string.
    """
    self.tfidf_vectorizer = TfidfVectorizer(
        stop_words=ENGLISH_STOP_WORDS, lowercase=True,
        strip_accents="unicode",
        use_idf=True, norm="l2", min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
        max_df=Constants.MAX_DICTIONARY_WORD_COUNT, ngram_range=(1, 1))
    self.document_term_matrix = \
        self.tfidf_vectorizer.fit_transform(self.target_bows)

    # Invert the vocabulary mapping {term: column index} into a
    # column-ordered list of terms.
    vocabulary = self.tfidf_vectorizer.vocabulary_
    self.terms = [""] * len(vocabulary)
    for term, index in vocabulary.items():
        self.terms[index] = term

    # Single parenthesized argument: valid as both a py2 print statement
    # and a py3 print call.
    print("Created document-term matrix of size %d x %d" % (
        self.document_term_matrix.shape[0],
        self.document_term_matrix.shape[1]
    ))
def get_stopwords():
    """Return the union of NLTK, sklearn, and spaCy English stop words.

    NOTE(review): `spacy_stopwords` is not defined inside this function —
    presumably a module-level set (e.g. spacy.lang.en.stop_words.STOP_WORDS);
    confirm it is in scope before calling.
    """
    nltk_stopwords = set(stopwords.words('english'))
    sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS

    all_stopwords = set()
    all_stopwords |= spacy_stopwords
    all_stopwords |= nltk_stopwords
    all_stopwords |= sklearn_stopwords

    return all_stopwords
def _check_stop_list(stop):
    """Resolve a stop-word argument into a concrete stop list.

    Returns ENGLISH_STOP_WORDS for the string 'english', None for None,
    and a frozenset for any other collection. Any other *string* raises
    ValueError — this is the origin of the
    "not a built-in stop list: russian" error: 'english' is the only
    built-in list name.
    """
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:
        # Assume it's a collection of words.
        return frozenset(stop)
def __init__(self):
    """Initialise the model slot, spaCy pipeline, stop words, and punctuation."""
    self.model = None

    self.spacynlp = spacy.load('en')

    # BUG FIX: spaCy's STOP_WORDS is a set, so `STOP_WORDS + [...]` would
    # raise TypeError; convert it to a list before concatenating.
    self.stopwords = set(list(STOP_WORDS) + ["n't", "'s", "'m", "ca"] +
                         list(ENGLISH_STOP_WORDS))

    # One entry per punctuation character, plus a few multi-char tokens.
    # (The original had a scrape artifact `\[` before the literal list.)
    self.punctuations = " ".join(string.punctuation).split(" ") + \
        ["-----", "---", "...", "'ve"]
def wordCount(text):
    """Count words in `text` longer than 3 characters that are not stop words.

    Punctuation, digits, and \r/\t/\n are replaced with spaces before
    splitting. Returns 0 for any input that cannot be processed —
    the original's deliberate best-effort contract is preserved.
    """
    try:
        text = text.lower()
        # Strip punctuation, digits, and control whitespace in one pass.
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        txt = regex.sub(" ", text)
        words = [
            w
            for w in txt.split(" ")
            if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
        ]
        return len(words)
    except Exception:
        return 0
def _loadSpecialWords(self):
    """Load stop words, number prefixes, news agencies, and protest subject words."""
    # Approximate-quantity prefixes that may precede a number.
    self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many', 'nearly', 'more than', 'about']

    # Subject words grouped by topic; includes common misspellings and
    # hyphenation variants of "counter-protesters".
    self.P_SUBJ = {
        'protest': ['protesters', 'protestors', 'demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
                    'counter-demonstrators', 'counter-demonstraters', 'counter-protesters', 'counter-protestors', 'counterprotesters',
                    'counterprotestors']
    }

    # News agency / wire service names.
    self.AGW = ['Agence France-Presse, English Service', 'Associated Press Worldstream, English Service']

    # Stop words taken from sklearn's built-in English list.
    self.SWS = list(stop_words.ENGLISH_STOP_WORDS)

Suggestion : 3

Get a list of common stop words in various languages in Python. The `stop-words` package is available on PyPI: `pip install stop-words`.

So easily install it by pip

$ pip install stop-words

Or by easy_install

$ easy_install stop-words

Another way is to clone stop-words' git repository:

$ git clone --recursive git://github.com/Alir3z4/python-stop-words.git
# Usage of the `stop-words` PyPI package: get_stop_words accepts either
# an ISO 639-1 code ('en') or the full language name ('english').
from stop_words import get_stop_words

stop_words = get_stop_words('en')
stop_words = get_stop_words('english')

from stop_words import safe_get_stop_words

# safe_get_stop_words does not raise for unsupported languages
# (per the package docs it returns an empty list instead).
stop_words = safe_get_stop_words('unsupported language')