Exploring Text Preprocessing Techniques

In the vast landscape of Natural Language Processing (NLP), text preprocessing is a crucial first step toward extracting meaningful insights from textual data. In this blog post, we walk through several text preprocessing techniques in Python, focusing on tokenization, n-gram generation, and stop word removal.
import re
import nltk
from nltk.util import ngrams
# NLTK's English stop word list has 179 entries (varies by version);
# scikit-learn's ENGLISH_STOP_WORDS has 318.
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS as sklearn_stop_words,
)
nltk.download('stopwords')
Tokenization with Regular Expressions: The code begins by importing the necessary libraries and defining a sentence. Using regular expressions, the sentence is split into tokens while filtering out punctuation marks and whitespace characters.
N-grams Generation: Next, the code generates bi-grams and tri-grams from the tokenized text. This process involves creating sequences of adjacent words to capture contextual information.
Stop Words Removal: The code then proceeds to remove stop words using both NLTK and Scikit-learn libraries. Stop words are common words like "the," "is," and "and" that often carry little semantic meaning and can be safely disregarded in many NLP tasks.
# tokens with re
sentence = ("Thomas Jefferson began building Monticello "
            "at the age of 26.")
pattern = re.compile(r"([-\s.,;!?])+")
tokens = pattern.split(sentence)
tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
tokens
['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'at', 'the', 'age', 'of', '26']
two_grams = list(ngrams(tokens, 2))
[" ".join(x) for x in two_grams]
['Thomas Jefferson', 'Jefferson began', 'began building', 'building Monticello', 'Monticello at', 'at the', 'the age', 'age of', 'of 26']
three_grams = list(ngrams(tokens, 3))
[" ".join(x) for x in three_grams]
['Thomas Jefferson began', 'Jefferson began building', 'began building Monticello', 'building Monticello at', 'Monticello at the', 'at the age', 'the age of', 'age of 26']
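As a side note, the same n-grams can be produced with plain Python, without NLTK, by zipping shifted views of the token list. This is a minimal sketch; `my_ngrams` is a hypothetical helper name, not part of any library:

```python
# Build n-grams by zipping n shifted copies of the token list;
# zip stops at the shortest copy, yielding len(seq) - n + 1 grams.
def my_ngrams(seq, n):
    return list(zip(*(seq[i:] for i in range(n))))

tokens = ['Thomas', 'Jefferson', 'began', 'building', 'Monticello',
          'at', 'the', 'age', 'of', '26']
print([" ".join(g) for g in my_ngrams(tokens, 2)][:3])
# ['Thomas Jefferson', 'Jefferson began', 'began building']
```

NLTK's `ngrams` is lazier and more general, but this one-liner is handy when you want to avoid the dependency.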
# stop words in NLTK
stop_words = nltk.corpus.stopwords.words('english')
len(stop_words)
179
stop_words[:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
# single-character stop words (len == 1)
one_char_words = [x for x in stop_words if len(x) == 1]
one_char_words
['i', 'a', 's', 't', 'd', 'm', 'o', 'y']
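To actually remove stop words from the tokenized sentence, you can filter the token list against a stop word set. The sketch below uses a small inline set for illustration; in practice you would pass the union of the NLTK and scikit-learn lists, and `remove_stop_words` is a hypothetical helper name:

```python
# Drop any token whose lowercase form appears in the stop word set.
def remove_stop_words(tokens, stop_set):
    return [t for t in tokens if t.lower() not in stop_set]

demo_stop_set = {'at', 'the', 'of'}  # stand-in for the full combined list
tokens = ['Thomas', 'Jefferson', 'began', 'building', 'Monticello',
          'at', 'the', 'age', 'of', '26']
print(remove_stop_words(tokens, demo_stop_set))
# ['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'age', '26']
```

With both libraries loaded, the combined set is simply `set(stop_words) | set(sklearn_stop_words)`; lowercasing before the membership test matters because both lists store stop words in lowercase.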