Use the following code. Note that the custom stop-word set must be passed as an object, not as the string 'my_stop_words', and that TfidfVectorizer is reached through the imported text module:
from sklearn.feature_extraction import text

my_stop_words = text.ENGLISH_STOP_WORDS.union(["russian"])
tfidf = text.TfidfVectorizer(stop_words=list(my_stop_words))
Z = tfidf.fit_transform(X)
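For reference, here is a minimal runnable sketch of the same approach; the two-document corpus stands in for X and is purely illustrative:
from sklearn.feature_extraction import text

# Extend the built-in English stop words with custom terms.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["russian"])

# Placeholder documents in place of X.
docs = [
    "The russian delegation arrived in the morning.",
    "Stop words are removed before computing tf-idf scores.",
]

tfidf = text.TfidfVectorizer(stop_words=list(my_stop_words))
Z = tfidf.fit_transform(docs)

# 'russian' is filtered out along with the standard English stop words.
print(tfidf.get_feature_names_out())  # requires scikit-learn >= 1.0
print(Z.shape)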
Suggestion : 2
Relevant API: sklearn.feature_extraction.text.ENGLISH_STOP_WORDS, sklearn.feature_extraction.text.TfidfVectorizer()
def build_document_term_matrix(self):
    # Requires: from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
    self.tfidf_vectorizer = TfidfVectorizer(
        stop_words=ENGLISH_STOP_WORDS, lowercase=True,
        strip_accents="unicode",
        use_idf=True, norm="l2",
        min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
        max_df=Constants.MAX_DICTIONARY_WORD_COUNT,
        ngram_range=(1, 1))
    self.document_term_matrix = \
        self.tfidf_vectorizer.fit_transform(self.target_bows)
    # Invert the fitted vocabulary (term -> column index) into a list of terms.
    vocabulary = self.tfidf_vectorizer.vocabulary_
    num_terms = len(vocabulary)
    self.terms = [""] * num_terms
    for term in vocabulary.keys():
        self.terms[vocabulary[term]] = term
    print("Created document-term matrix of size %d x %d" % (
        self.document_term_matrix.shape[0],
        self.document_term_matrix.shape[1]
    ))
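A standalone sketch of the same configuration follows; the corpus and the two threshold values are made-up stand-ins for self.target_bows and the Constants used in the class above:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Placeholder corpus and thresholds; the real values come from the class above.
target_bows = ["first example document", "second example document", "third one"]
MIN_DICTIONARY_WORD_COUNT = 1    # assumed value for min_df
MAX_DICTIONARY_WORD_COUNT = 1.0  # assumed value for max_df (float = proportion of documents)

vectorizer = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS, lowercase=True, strip_accents="unicode",
    use_idf=True, norm="l2",
    min_df=MIN_DICTIONARY_WORD_COUNT, max_df=MAX_DICTIONARY_WORD_COUNT,
    ngram_range=(1, 1))
document_term_matrix = vectorizer.fit_transform(target_bows)

# Recover the column order of terms from the fitted vocabulary.
terms = [""] * len(vectorizer.vocabulary_)
for term, index in vectorizer.vocabulary_.items():
    terms[index] = term
print("Created document-term matrix of size %d x %d" % document_term_matrix.shape)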
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

def get_stopwords():
    # Union of the NLTK, scikit-learn, and spaCy English stop-word lists.
    nltk_stopwords = set(stopwords.words('english'))
    sklearn_stopwords = ENGLISH_STOP_WORDS
    all_stopwords = set()
    all_stopwords |= spacy_stopwords
    all_stopwords |= nltk_stopwords
    all_stopwords |= sklearn_stopwords
    return all_stopwords
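A brief usage sketch, assuming the get_stopwords() function above is in scope and the NLTK stop-word corpus has been downloaded (nltk.download('stopwords')); the documents are invented:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the quick brown fox", "jumps over the lazy dog"]

# Recent scikit-learn versions warn when given a set, so pass a list.
vectorizer = TfidfVectorizer(stop_words=list(get_stopwords()))
matrix = vectorizer.fit_transform(docs)
print(matrix.shape)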
def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)
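That helper is how scikit-learn's vectorizers interpret the stop_words argument (the six-based string check is from an older release and would need six and ENGLISH_STOP_WORDS in scope). The same three branches can be observed through the public get_stop_words() method:
from sklearn.feature_extraction.text import TfidfVectorizer

# "english" resolves to the built-in frozenset of English stop words.
print(len(TfidfVectorizer(stop_words="english").get_stop_words()))

# None disables stop-word filtering entirely.
print(TfidfVectorizer(stop_words=None).get_stop_words())

# Any other collection is converted to a frozenset and used as-is.
print(TfidfVectorizer(stop_words=["foo", "bar"]).get_stop_words())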
def __init__(self):
    # Requires: import string, spacy
    #   from spacy.lang.en.stop_words import STOP_WORDS
    #   from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    self.model = None
    self.spacynlp = spacy.load('en')  # use 'en_core_web_sm' on spaCy 3+
    # STOP_WORDS is a set, so convert both sources to lists before concatenating.
    self.stopwords = set(list(STOP_WORDS) + ["n't", "'s", "'m", "ca"] +
                         list(ENGLISH_STOP_WORDS))
    self.punctuations = " ".join(string.punctuation).split(" ") + \
                        ["-----", "---", "...", "'ve"]
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def wordCount(text):
    # Count words longer than 3 characters that are not English stop words.
    try:
        text = text.lower()
        # Replace punctuation, digits, and whitespace control characters with spaces.
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        txt = regex.sub(" ", text)
        words = [
            w
            for w in txt.split(" ")
            if w not in ENGLISH_STOP_WORDS and len(w) > 3
        ]
        return len(words)
    except Exception:
        return 0
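A quick call, assuming the wordCount function above is in scope; the sentence is arbitrary:
# Stop words and tokens of 3 characters or fewer are ignored,
# so only the longer content words contribute to the count.
print(wordCount("This is a simple demonstration sentence about vectorization."))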
def _loadSpecialWords(self):
    '''Load stop words, number prefixes, news agencies, and protest subject words.'''
    # Requires: from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many',
                     'nearly', 'more than', 'about']
    self.P_SUBJ = {
        'protest': ['protesters', 'protestors', 'demonstrators', 'activists',
                    'strikers', 'marchers', 'signatures',
                    'counter-demonstrators', 'counter-demonstraters',
                    'counter-protesters', 'counter-protestors',
                    'counterprotesters', 'counterprotestors']
    }
    self.AGW = ['Agence France-Presse, English Service',
                'Associated Press Worldstream, English Service']
    self.SWS = list(ENGLISH_STOP_WORDS)
Suggestion : 3
Get a list of common stop words in various languages in Python. The stop-words package is available on PyPI.
Install it easily with pip:
$ pip install stop-words
Or with easy_install:
$ easy_install stop-words
Another way is to clone the stop-words git repository:
$ git clone --recursive git://github.com/Alir3z4/python-stop-words.git
from stop_words import get_stop_words

stop_words = get_stop_words('en')
stop_words = get_stop_words('english')

from stop_words import safe_get_stop_words

stop_words = safe_get_stop_words('unsupported language')
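To connect this with the earlier suggestions, here is a small sketch (the corpus is made up) showing the package's list feeding scikit-learn's vectorizer:
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer

# get_stop_words returns a plain list, which TfidfVectorizer accepts directly.
en_stop_words = get_stop_words('en')

docs = ["the cat sat on the mat", "a dog chased the cat"]
vectorizer = TfidfVectorizer(stop_words=en_stop_words)
matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # requires scikit-learn >= 1.0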