Spaces:

Cachoups
/

FinanceReport

Sleeping

App Files Files Community

Cachoups committed on Sep 23, 2024

Commit

1aa667b

verified ·

1 Parent(s): 367182a

Update lib/comparison.py

Browse files

Files changed (1) hide show

lib/comparison.py +48 -94

lib/comparison.py CHANGED Viewed

@@ -1,94 +1,48 @@
-from transformers import BertTokenizer, BertForSequenceClassification, pipeline, BertModel,T5ForConditionalGeneration, T5Tokenizer
-import torch
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-# Load the pre-trained FinBERT model for sentiment analysis.
-# Use the GPU when CUDA is available, otherwise fall back to the CPU.
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-finbert_model_name = "yiyanghkust/finbert-tone"
-finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
-finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
-finbert_model.to(device)
-# The pipeline receives device index 0 when `device` is CUDA and -1 otherwise.
-finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=0 if device.type == "cuda" else -1)
-# Load the pre-trained T5 model for summarization (placed on the same device).
-t5_model_name = "t5-small"  # You can also use "t5-base" or "t5-large" for better summaries
-t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)
-t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
-def analyze_and_summarize_paragraphs(paragraphs):
-    """Perform sentiment analysis and summarization on each paragraph.
-
-    Args:
-        paragraphs: Iterable of paragraph texts (presumably strings; each one
-            is passed to the FinBERT pipeline and to the T5 tokenizer).
-
-    Returns:
-        A list of dicts with the keys "paragraph_text", "summary" and
-        "sentiment", where "sentiment" is the first result returned by the
-        FinBERT pipeline. Paragraphs whose label is not Positive, Negative
-        or Neutral are left out.
-    """
-    results = []
-    for paragraph in paragraphs:
-        # Perform sentiment analysis using FinBERT
-        sentiment_result = finbert_pipeline(paragraph)
-        # Perform summarization using T5; the input is prefixed with "summarize: "
-        t5_input = f"summarize: {paragraph}"
-        input_ids = t5_tokenizer.encode(t5_input, return_tensors="pt").to(device)
-        # Generate with 6 beams and max_length=80
-        summary_ids = t5_model.generate(input_ids, max_length=80, num_beams=6, early_stopping=True)
-        summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        # Keep the result only when the label is Positive, Negative or Neutral
-        if sentiment_result and sentiment_result[0]['label'] in ['Positive', 'Negative', 'Neutral']:
-            results.append({
-                "paragraph_text": paragraph,
-                "summary": summary,
-                "sentiment": sentiment_result[0]
-            })
-    return results
-# Load the BERT tokenizer and encoder used by get_bert_embeddings
-bert_model_name = "bert-base-uncased"
-tokenizer = BertTokenizer.from_pretrained(bert_model_name)
-model = BertModel.from_pretrained(bert_model_name)
-model.eval()  # Set to evaluation mode
-# Sentence embeddings for summary texts using BERT
-def get_bert_embeddings(texts):
-    """Obtain BERT embeddings for a list of texts.
-
-    Each text is tokenized and encoded on its own; the returned NumPy array
-    holds one mean-pooled vector per text, in input order.
-    """
-    embeddings = []
-    with torch.no_grad():  # inference only, no gradients are tracked
-        for text in texts:
-            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
-            outputs = model(**inputs)
-            # Take the mean of token embeddings as the sentence embedding
-            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-            embeddings.append(embedding)
-    return np.array(embeddings)
-# Compute similarity matrices over embeddings
-def compute_similarity(embeddings1, embeddings2):
-    """Compute pairwise cosine similarity between two sets of embeddings.
-
-    Thin wrapper around sklearn's cosine_similarity; returns its result
-    unchanged.
-    """
-    return cosine_similarity(embeddings1, embeddings2)
-# For each summarized paragraph, find the closest summary in the other document and pair their contents
-def compare_summaries(results1, results2):
-    """Compare summaries from two documents and return similarity scores.
-
-    Args:
-        results1: List of dicts with at least the keys 'summary' and
-            'sentiment' (the shape produced by analyze_and_summarize_paragraphs).
-        results2: Same structure, for the second document.
-
-    Returns:
-        One dict per entry of results1 with the keys 'summary_doc1',
-        'summary_doc2', 'sentiment_doc1', 'sentiment_doc2' and
-        'similarity_score', where the doc2 values belong to the entry of
-        results2 with the highest cosine similarity.
-    """
-    # Get embeddings for each set of summaries
-    summaries1 = [result['summary'] for result in results1]
-    summaries2 = [result['summary'] for result in results2]
-    sentiment1 = [result['sentiment'] for result in results1]
-    sentiment2 = [result['sentiment'] for result in results2]
-    embeddings1 = get_bert_embeddings(summaries1)
-    embeddings2 = get_bert_embeddings(summaries2)
-    # Compute similarity: row i holds the scores of summaries1[i] against every entry of summaries2
-    similarity_matrix = compute_similarity(embeddings1, embeddings2)
-    # Analyze matches: keep only the best-scoring counterpart for each row
-    matches = []
-    for i, row in enumerate(similarity_matrix):
-        most_similar_index = np.argmax(row)
-        similarity_score = row[most_similar_index]
-        matches.append({
-            'summary_doc1': summaries1[i],
-            'summary_doc2': summaries2[most_similar_index],
-            'sentiment_doc1': sentiment1[i],
-            'sentiment_doc2': sentiment2[most_similar_index],
-            'similarity_score': similarity_score
-        })
-    return matches

+from transformers import BertTokenizer, BertModel
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+# Shared BERT checkpoint: encoder and tokenizer are loaded once at import time
+bert_model_name = "bert-base-uncased"
+# .eval() returns the module itself, so the encoder is bound already in evaluation mode
+model = BertModel.from_pretrained(bert_model_name).eval()
+tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+# Embed texts with BERT (mean-pooled last hidden state)
+def get_bert_embeddings(texts):
+    """Return a NumPy array holding one mean-pooled BERT vector per text."""
+    def _embed(text):
+        # Encode a single text and run it through the encoder
+        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
+        hidden_states = model(**encoded).last_hidden_state
+        # Average over the token axis to get one vector for the whole text
+        return hidden_states.mean(dim=1).squeeze().numpy()
+
+    # Inference only: no gradients are needed
+    with torch.no_grad():
+        vectors = [_embed(text) for text in texts]
+    return np.array(vectors)
+# Pairwise cosine similarity between two sets of embeddings
+def compute_similarity(embeddings1, embeddings2):
+    """Return the cosine-similarity matrix of embeddings1 against embeddings2."""
+    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
+    return similarity_matrix
+# Compare a paragraph with a list of other paragraphs
+def compare_summaries(paragraph, paragraphs):
+    """
+    Find the entry of `paragraphs` that is most similar to `paragraph`.
+
+    Both inputs are embedded with get_bert_embeddings and scored with
+    compute_similarity (cosine similarity).
+
+    Args:
+        paragraph: The single text to match.
+        paragraphs: Non-empty list of candidate texts to compare against.
+
+    Returns:
+        A tuple (most_similar_text, similarity_score), where
+        most_similar_text is the element of `paragraphs` with the highest
+        similarity to `paragraph`.
+
+    Raises:
+        ValueError: If `paragraphs` is empty.
+    """
+    # Fail early with a clear message instead of an opaque shape error downstream
+    if len(paragraphs) == 0:
+        raise ValueError("paragraphs must contain at least one text to compare against")
+    # Get embeddings for the paragraph and for each candidate
+    paragraph_embedding = get_bert_embeddings([paragraph])[0]
+    candidate_embeddings = get_bert_embeddings(paragraphs)
+    # Single-row similarity matrix: one score per candidate
+    similarities = compute_similarity([paragraph_embedding], candidate_embeddings)[0]
+    # Pick the best-scoring candidate
+    most_similar_index = int(np.argmax(similarities))
+    # Look the winner up in `paragraphs`, the list that was actually embedded
+    most_similar_summary = paragraphs[most_similar_index]
+    similarity_score = similarities[most_similar_index]
+    return most_similar_summary, similarity_score