webleaf.model.TextModel
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize

# Number of dimensions in each text embedding vector produced by the model
TEXT_DIMS = 384


class TextEmbeddingModel:
    """
    Sentence-level text embedder backed by a pre-trained transformer model.

    The class wraps the SentenceTransformer checkpoint
    'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' and uses it to turn text
    into dense embedding vectors.

    Attributes:
    -----------
    model : SentenceTransformer
        The loaded SentenceTransformer instance used for encoding.

    Methods:
    --------
    get_text_embeddings(text):
        Encodes the first sentence of every input string.
    """

    def __init__(self):
        """
        Fetch the NLTK tokenizer resources and load the embedding model.

        Both the 'punkt' and 'punkt_tab' NLTK resources are requested via
        `nltk.download` each time an instance is created; afterwards the
        SentenceTransformer checkpoint is loaded into `self.model`.
        """
        nltk_resources = ("punkt", "punkt_tab")
        for name in nltk_resources:
            nltk.download(name)

        checkpoint = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
        self.model = SentenceTransformer(checkpoint)

    def get_text_embeddings(self, text):
        """
        Encode the first sentence of each input string.

        Every string is split with NLTK's `sent_tokenize`; only the first
        resulting sentence is kept. A string that yields no sentences is
        replaced by the empty string so the output keeps one entry per input.

        Parameters:
        -----------
        text : list of str
            The strings to embed.

        Returns:
        --------
        numpy.ndarray
            The result of `self.model.encode` on the collected first
            sentences, one row per input string.
        """
        first_sentences = [
            pieces[0] if pieces else ""
            for pieces in map(sent_tokenize, text)
        ]
        return self.model.encode(first_sentences)
class TextEmbeddingModel:
    """
    Sentence-level text embedder backed by a pre-trained transformer model.

    The class wraps the SentenceTransformer checkpoint
    'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' and uses it to turn text
    into dense embedding vectors.

    Attributes:
    -----------
    model : SentenceTransformer
        The loaded SentenceTransformer instance used for encoding.

    Methods:
    --------
    get_text_embeddings(text):
        Encodes the first sentence of every input string.
    """

    def __init__(self):
        """
        Fetch the NLTK tokenizer resources and load the embedding model.

        Both the 'punkt' and 'punkt_tab' NLTK resources are requested via
        `nltk.download` each time an instance is created; afterwards the
        SentenceTransformer checkpoint is loaded into `self.model`.
        """
        nltk.download("punkt")
        nltk.download("punkt_tab")
        self.model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

    def get_text_embeddings(self, text):
        """
        Encode the first sentence of each input string.

        Every string is split with NLTK's `sent_tokenize`; only the first
        resulting sentence is kept. A string that yields no sentences is
        replaced by the empty string so the output keeps one entry per input.

        Parameters:
        -----------
        text : list of str
            The strings to embed.

        Returns:
        --------
        numpy.ndarray
            The result of `self.model.encode` on the collected first
            sentences, one row per input string.
        """
        leading = []
        for entry in text:
            parts = sent_tokenize(entry)
            leading.append(parts[0] if parts else "")
        return self.model.encode(leading)
A class to handle text embeddings for sentences using a pre-trained transformer model.
This class uses the HuggingFace SentenceTransformer model 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' to encode text into dense embeddings. The embeddings are designed to capture the semantic meaning of the text.
Attributes:
model : SentenceTransformer The pre-trained SentenceTransformer model used to generate text embeddings.
Methods:
get_text_embeddings(text): Generates embeddings for the input text data.
def __init__(self):
    """
    Fetch the NLTK tokenizer resources and load the embedding model.

    Both the 'punkt' and 'punkt_tab' NLTK resources are requested via
    `nltk.download` each time an instance is created; afterwards the
    SentenceTransformer checkpoint 'multi-qa-MiniLM-L6-cos-v1' is loaded
    into `self.model`.
    """
    nltk_resources = ("punkt", "punkt_tab")
    for name in nltk_resources:
        nltk.download(name)

    checkpoint = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
    self.model = SentenceTransformer(checkpoint)
Initializes the TextEmbeddingModel by downloading necessary NLTK data and loading the pre-trained model.
NLTK's 'punkt' and 'punkt_tab' tokenizer resources are downloaded so that input text can be split into sentences, and the SentenceTransformer model 'multi-qa-MiniLM-L6-cos-v1' is loaded to generate embeddings.
def get_text_embeddings(self, text):
    """
    Encode the first sentence of each input string.

    Every string is split with NLTK's `sent_tokenize`; only the first
    resulting sentence is kept. A string that yields no sentences is
    replaced by the empty string so the output keeps one entry per input.

    Parameters:
    -----------
    text : list of str
        The strings to embed.

    Returns:
    --------
    numpy.ndarray
        The result of `self.model.encode` on the collected first
        sentences, one row per input string.
    """
    first_sentences = [
        pieces[0] if pieces else ""
        for pieces in map(sent_tokenize, text)
    ]
    return self.model.encode(first_sentences)
Generates embeddings for a list of text strings. Each text string is split into sentences, and only its first sentence is encoded into a dense embedding using the pre-trained SentenceTransformer model. A string that yields no sentences is encoded as the empty string.
Parameters:
text : list of str A list of text strings for which embeddings are to be generated.
Returns:
numpy.ndarray A 2D array of embeddings where each row is the embedding of the first sentence of the corresponding input string.