webleaf.model.TextModel

 1from sentence_transformers import SentenceTransformer
 2import nltk
 3from nltk.tokenize import sent_tokenize
 4
 5# Dimensionality (vector length) of the text embeddings produced by the model loaded in TextEmbeddingModel
 6TEXT_DIMS = 384
 7
 8class TextEmbeddingModel:
 9    """
10    A class to handle text embeddings for sentences using a pre-trained transformer model.
11
12    This class uses the SentenceTransformer model from the HuggingFace 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
13    to encode text into dense embeddings. The embeddings are designed to capture the semantic meaning of the text.
14
15    Attributes:
16    -----------
17    model : SentenceTransformer
18        The pre-trained SentenceTransformer model used to generate text embeddings.
19
20    Methods:
21    --------
22    get_text_embeddings(text):
23        Generates embeddings for the input text data.
24    """
25    def __init__(self):
26        """
27        Initializes the TextEmbeddingModel by downloading necessary NLTK data and loading the pre-trained model.
28
29        NLTK's 'punkt' tokenizer is downloaded to tokenize input text into sentences, and the SentenceTransformer model
30        'multi-qa-MiniLM-L6-cos-v1' is loaded to generate embeddings.
31        """
32        for resource in ["punkt", "punkt_tab"]:
33            nltk.download(resource)
34        self.model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
35
36    def get_text_embeddings(self, text):
37        """
38        Generates embeddings for a list of text strings. Each text string is tokenized into its first sentence,
39        and that sentence is encoded into a dense embedding using the pre-trained SentenceTransformer model.
40
41        Parameters:
42        -----------
43        text : list of str
44            A list of text strings for which embeddings are to be generated.
45
46        Returns:
47        --------
48        numpy.ndarray
49            A 2D array of embeddings where each row represents the embedding of a sentence from the input text.
50        """
51        sentences = []
52        for t in text:
53            sentence = sent_tokenize(t)
54            if sentence:
55                sentences.append(sentence[0])
56            else:
57                sentences.append("")
58        return self.model.encode(sentences)
TEXT_DIMS = 384
class TextEmbeddingModel:
 9class TextEmbeddingModel:
10    """
11    A class to handle text embeddings for sentences using a pre-trained transformer model.
12
13    This class uses the SentenceTransformer model from the HuggingFace 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
14    to encode text into dense embeddings. The embeddings are designed to capture the semantic meaning of the text.
15
16    Attributes:
17    -----------
18    model : SentenceTransformer
19        The pre-trained SentenceTransformer model used to generate text embeddings.
20
21    Methods:
22    --------
23    get_text_embeddings(text):
24        Generates embeddings for the input text data.
25    """
26    def __init__(self):
27        """
28        Initializes the TextEmbeddingModel by downloading necessary NLTK data and loading the pre-trained model.
29
30        NLTK's 'punkt' tokenizer is downloaded to tokenize input text into sentences, and the SentenceTransformer model
31        'multi-qa-MiniLM-L6-cos-v1' is loaded to generate embeddings.
32        """
33        for resource in ["punkt", "punkt_tab"]:
34            nltk.download(resource)
35        self.model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
36
37    def get_text_embeddings(self, text):
38        """
39        Generates embeddings for a list of text strings. Each text string is tokenized into its first sentence,
40        and that sentence is encoded into a dense embedding using the pre-trained SentenceTransformer model.
41
42        Parameters:
43        -----------
44        text : list of str
45            A list of text strings for which embeddings are to be generated.
46
47        Returns:
48        --------
49        numpy.ndarray
50            A 2D array of embeddings where each row represents the embedding of a sentence from the input text.
51        """
52        sentences = []
53        for t in text:
54            sentence = sent_tokenize(t)
55            if sentence:
56                sentences.append(sentence[0])
57            else:
58                sentences.append("")
59        return self.model.encode(sentences)

A class to handle text embeddings for sentences using a pre-trained transformer model.

This class uses the Hugging Face SentenceTransformer model 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' to encode text into dense embeddings. The embeddings are designed to capture the semantic meaning of the text.

Attributes:

model : SentenceTransformer — The pre-trained SentenceTransformer model used to generate text embeddings.

Methods:

get_text_embeddings(text): Generates embeddings for the input text data.

TextEmbeddingModel()
26    def __init__(self):
27        """
28        Initializes the TextEmbeddingModel by downloading necessary NLTK data and loading the pre-trained model.
29
30        NLTK's 'punkt' tokenizer is downloaded to tokenize input text into sentences, and the SentenceTransformer model
31        'multi-qa-MiniLM-L6-cos-v1' is loaded to generate embeddings.
32        """
33        for resource in ["punkt", "punkt_tab"]:
34            nltk.download(resource)
35        self.model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

Initializes the TextEmbeddingModel by downloading necessary NLTK data and loading the pre-trained model.

NLTK's 'punkt' and 'punkt_tab' tokenizer resources are downloaded so that input text can be split into sentences, and the SentenceTransformer model 'multi-qa-MiniLM-L6-cos-v1' is loaded to generate embeddings.

model
def get_text_embeddings(self, text):
37    def get_text_embeddings(self, text):
38        """
39        Generates embeddings for a list of text strings. Each text string is tokenized into its first sentence,
40        and that sentence is encoded into a dense embedding using the pre-trained SentenceTransformer model.
41
42        Parameters:
43        -----------
44        text : list of str
45            A list of text strings for which embeddings are to be generated.
46
47        Returns:
48        --------
49        numpy.ndarray
50            A 2D array of embeddings where each row represents the embedding of a sentence from the input text.
51        """
52        sentences = []
53        for t in text:
54            sentence = sent_tokenize(t)
55            if sentence:
56                sentences.append(sentence[0])
57            else:
58                sentences.append("")
59        return self.model.encode(sentences)

Generates embeddings for a list of text strings. Each text string is split into sentences, and only its first sentence is encoded into a dense embedding using the pre-trained SentenceTransformer model. A string that yields no sentences is encoded as an empty string.

Parameters:

text : list of str — A list of text strings for which embeddings are to be generated.

Returns:

numpy.ndarray — A 2D array of embeddings in which each row is the embedding of the first sentence of the corresponding input string.