TF_ID IF

Published

October 16, 2022

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from visual import show_tfidf  


docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)
print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names())])
print("v2i: ", vectorizer.vocabulary_)


q = "I get a coffee cup"
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf)
res = res.ravel().argsort()[-3:]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))


i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
dense_tfidf = tf_idf.todense()
show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix")
idf:  [('am', 2.386294361119891), ('and', 2.386294361119891), ('apple', 3.0794415416798357), ('are', 3.0794415416798357), ('be', 2.6739764335716716), ('bob', 2.386294361119891), ('book', 3.0794415416798357), ('bring', 3.0794415416798357), ('but', 3.0794415416798357), ('care', 3.0794415416798357), ('cat', 2.6739764335716716), ('coffee', 2.6739764335716716), ('cup', 3.0794415416798357), ('day', 2.386294361119891), ('do', 2.6739764335716716), ('dog', 2.6739764335716716), ('good', 2.386294361119891), ('happy', 3.0794415416798357), ('hard', 3.0794415416798357), ('have', 3.0794415416798357), ('here', 2.6739764335716716), ('is', 1.9808292530117262), ('it', 1.9808292530117262), ('kitty', 2.6739764335716716), ('like', 1.9808292530117262), ('morning', 3.0794415416798357), ('not', 2.6739764335716716), ('on', 3.0794415416798357), ('party', 3.0794415416798357), ('stay', 3.0794415416798357), ('study', 3.0794415416798357), ('sunny', 3.0794415416798357), ('that', 3.0794415416798357), ('the', 3.0794415416798357), ('there', 3.0794415416798357), ('this', 3.0794415416798357), ('time', 3.0794415416798357), ('to', 2.6739764335716716), ('today', 2.386294361119891), ('tomorrow', 3.0794415416798357), ('tree', 3.0794415416798357), ('who', 3.0794415416798357), ('will', 3.0794415416798357), ('your', 3.0794415416798357)]
v2i:  {'it': 22, 'is': 21, 'good': 16, 'day': 13, 'like': 24, 'to': 37, 'stay': 29, 'here': 20, 'am': 0, 'happy': 17, 'be': 4, 'bob': 5, 'sunny': 31, 'today': 38, 'have': 19, 'party': 28, 'dog': 15, 'and': 1, 'that': 32, 'cat': 10, 'there': 34, 'are': 3, 'on': 27, 'the': 33, 'tree': 40, 'study': 30, 'hard': 18, 'this': 35, 'morning': 25, 'tomorrow': 39, 'will': 42, 'coffee': 11, 'book': 6, 'apple': 2, 'do': 14, 'not': 26, 'kitty': 23, 'care': 9, 'who': 41, 'but': 8, 'time': 36, 'bring': 7, 'your': 43, 'cup': 12}

top 3 docs for 'I get a coffee cup':
['It is coffee time, bring your cup', 'I like coffee, I like book and I like apple', 'I do not care who like bob, but I like kitty']

See you tomorrow !