"""Rank a small corpus against a query with TF-IDF and cosine similarity.

The script fits a ``TfidfVectorizer`` on ``docs``, prints the learned idf
weights and vocabulary, prints the three documents most similar to the
query ``q``, and finally hands the dense TF-IDF matrix to ``show_tfidf``.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from visual import show_tfidf

docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Learn the vocabulary and idf weights, and build the document-term matrix.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, feature_names)])
print("v2i: ", vectorizer.vocabulary_)

# Project the query into the fitted TF-IDF space and score every document.
q = "I get a coffee cup"
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf)
# argsort is ascending, so the last three indices are the best matches;
# they are reversed below to list the closest document first.
res = res.ravel().argsort()[-3:]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))

# Invert the word -> column mapping so columns can be labelled by word.
i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
dense_tfidf = tf_idf.todense()
# show_tfidf is a project-local helper from the visual module; presumably it
# renders the matrix under the given name -- confirm against visual.py.
show_tfidf(
    dense_tfidf,
    [i2v[i] for i in range(dense_tfidf.shape[1])],
    "tfidf_sklearn_matrix",
)
idf: [('am', 2.386294361119891), ('and', 2.386294361119891), ('apple', 3.0794415416798357), ('are', 3.0794415416798357), ('be', 2.6739764335716716), ('bob', 2.386294361119891), ('book', 3.0794415416798357), ('bring', 3.0794415416798357), ('but', 3.0794415416798357), ('care', 3.0794415416798357), ('cat', 2.6739764335716716), ('coffee', 2.6739764335716716), ('cup', 3.0794415416798357), ('day', 2.386294361119891), ('do', 2.6739764335716716), ('dog', 2.6739764335716716), ('good', 2.386294361119891), ('happy', 3.0794415416798357), ('hard', 3.0794415416798357), ('have', 3.0794415416798357), ('here', 2.6739764335716716), ('is', 1.9808292530117262), ('it', 1.9808292530117262), ('kitty', 2.6739764335716716), ('like', 1.9808292530117262), ('morning', 3.0794415416798357), ('not', 2.6739764335716716), ('on', 3.0794415416798357), ('party', 3.0794415416798357), ('stay', 3.0794415416798357), ('study', 3.0794415416798357), ('sunny', 3.0794415416798357), ('that', 3.0794415416798357), ('the', 3.0794415416798357), ('there', 3.0794415416798357), ('this', 3.0794415416798357), ('time', 3.0794415416798357), ('to', 2.6739764335716716), ('today', 2.386294361119891), ('tomorrow', 3.0794415416798357), ('tree', 3.0794415416798357), ('who', 3.0794415416798357), ('will', 3.0794415416798357), ('your', 3.0794415416798357)]
v2i: {'it': 22, 'is': 21, 'good': 16, 'day': 13, 'like': 24, 'to': 37, 'stay': 29, 'here': 20, 'am': 0, 'happy': 17, 'be': 4, 'bob': 5, 'sunny': 31, 'today': 38, 'have': 19, 'party': 28, 'dog': 15, 'and': 1, 'that': 32, 'cat': 10, 'there': 34, 'are': 3, 'on': 27, 'the': 33, 'tree': 40, 'study': 30, 'hard': 18, 'this': 35, 'morning': 25, 'tomorrow': 39, 'will': 42, 'coffee': 11, 'book': 6, 'apple': 2, 'do': 14, 'not': 26, 'kitty': 23, 'care': 9, 'who': 41, 'but': 8, 'time': 36, 'bring': 7, 'your': 43, 'cup': 12}
top 3 docs for 'I get a coffee cup':
['It is coffee time, bring your cup', 'I like coffee, I like book and I like apple', 'I do not care who like bob, but I like kitty']