Models

`model.nlp`

`get_topic_words(topic, topic_word_dist, vocab, topn=5)`

returns the top n words for a given topic from the topic model

Parameters:

Name	Type	Description	Default
`topic`	`int`	index of the topic	required
`topic_word_dist`	`ndarray`	word distribution for each topic	required
`vocab`	`ndarray`	vocabulary from vectorizer	required
`topn`	`int`	number of top words to return. Defaults to 5.	`5`

Returns:

Type	Description
`ndarray`	np.ndarray: top n words for the given topic

Source code in src/model/nlp.py

def get_topic_words(topic: int, topic_word_dist: np.ndarray, vocab: np.ndarray, topn: int = 5) -> np.ndarray:
    """Return the ``topn`` highest-weighted words for one topic of the topic model.

    Args:
        topic (int): index of the topic (row of ``topic_word_dist``)
        topic_word_dist (np.ndarray): word distribution for each topic,
            indexed as ``[topic, word]``
        vocab (np.ndarray): vocabulary from vectorizer, indexed by word position
        topn (int, optional): number of top words to return. Defaults to 5.
            A value <= 0 yields an empty array; a value larger than the
            vocabulary returns every word.

    Returns:
        np.ndarray: top n words for the given topic, highest weight first
    """

    if topn <= 0:
        # A ``[-0:]`` slice would select the *entire* array rather than
        # nothing, so "no words requested" has to be handled explicitly.
        return vocab[:0]

    # argsort is ascending; reverse it so the heaviest words come first.
    ranked = topic_word_dist[topic, :].argsort()[::-1]
    return vocab[ranked[:topn].tolist()]

`most_prevalent_topic(doc_topics)`

returns the most prevalent topic for each document

Parameters:

Name	Type	Description	Default
`doc_topics`	`ndarray`	fitted and transformed array of document topics	required

Returns:

Type	Description
`ndarray`	np.ndarray: topic index for each document

Source code in src/model/nlp.py

def most_prevalent_topic(doc_topics : np.ndarray) -> np.ndarray:
    """Pick, for every document, the topic that carries the largest weight.

    Args:
        doc_topics (np.ndarray): fitted and transformed array of document
            topics; the maximum is taken along axis 1 of this array

    Returns:
        np.ndarray: topic index for each document
    """

    dominant_topic = np.argmax(doc_topics, axis=1)
    return dominant_topic

`top_features_sm(dtm, vec, n=10)`

Counts the top n features from a sparse matrix and returns a dictionary with the counts

Parameters:

Name	Type	Description	Default
`dtm`	`csr_matrix`	sparse matrix with the data	required
`vec`	`CountVectorizer`	sklearn CountVectorizer used to build the vocabulary	required
`n`	`int`	number of top features to return. Defaults to 10.	`10`

Returns:

Type	Description
`dict[str, int]`	dict[str, int]: dictionary with the top features and their counts

Source code in src/model/nlp.py

def top_features_sm(dtm : sp.sparse.csr_matrix, vec : sklearn.feature_extraction.text.CountVectorizer, n : int =10) -> dict[str, int]:  # noqa: E501
    """Count the top n features of a sparse matrix and return them with their totals.

    Args:
        dtm (sp.sparse.csr_matrix): sparse matrix with the data; it is summed
            along axis 0 to get one total per feature
        vec (CountVectorizer): sklearn CountVectorizer used to build the
            vocabulary; only ``get_feature_names_out()`` is called on it
        n (int, optional): number of top features to return. Defaults to 10.
            A value <= 0 yields an empty dict; a value larger than the number
            of features returns all of them.

    Returns:
        dict[str, int]: dictionary with the top features and their counts,
        inserted from the highest count to the lowest. Keys and values are
        plain Python objects rather than numpy scalars.
    """

    import numpy as np

    if n <= 0:
        # A ``[-0:]`` slice would select every feature rather than none.
        return {}

    words = np.asarray(vec.get_feature_names_out())
    # Summing a sparse matrix gives a 2-D (1, n_features) result; flatten it
    # so each position lines up with the matching entry of ``words``.
    totals = np.asarray(dtm.sum(axis=0)).reshape(-1)

    # argsort is ascending; reverse it and keep the first n (largest) entries.
    top_indices = totals.argsort()[::-1][:n]

    # ``.item()`` unwraps numpy scalars into native Python numbers so the
    # result matches the annotated return type (and is e.g. JSON-serialisable).
    return {str(words[i]): totals[i].item() for i in top_indices}

`topic_words_dist_ranked(topic_idx, topic_word_dist, vocab, num_words=10)`

returns the top n words for a given topic from the topic model

Parameters:

Name	Type	Description	Default
`topic_idx`	`int`	topic number	required
`topic_word_dist`	`ndarray`	distribution of words for each topic	required
`vocab`	`ndarray`	vocabulary from vectorizer	required
`num_words`	`int`	number of top words to return. Defaults to 10.	`10`

Returns:

Name	Type	Description
`list[str]`	`list[str]`	top words for the topic, ordered from highest to lowest weight

Source code in src/model/nlp.py

def topic_words_dist_ranked(topic_idx: int, topic_word_dist: np.ndarray, vocab: np.ndarray, num_words: int = 10) -> list[str]:
    """Return the top n words for a given topic from the topic model as a list.

    Args:
        topic_idx (int): topic number (row of ``topic_word_dist``)
        topic_word_dist (np.ndarray): distribution of words for each topic
        vocab (np.ndarray): vocabulary from vectorizer, indexed by word position
        num_words (int, optional): number of top words to return. Defaults to 10.
            A value <= 0 yields an empty list; a value larger than the
            vocabulary returns every word.

    Returns:
        list[str]: the top words for the topic, highest weight first
    """

    if num_words <= 0:
        # A ``[-0:]`` slice would select the whole vocabulary rather than nothing.
        return []

    word_weights = topic_word_dist[topic_idx]
    # argsort is ascending; reverse it so the heaviest words come first.
    ranked_indices = word_weights.argsort()[::-1][:num_words]
    return [vocab[i] for i in ranked_indices]