Clustering Similar Headlines
Link to the GitHub repository
🧠 Clustering Similar Headlines Using Word Embeddings
📌 What I Did
The script clusters similar news headlines by their semantic meaning, using Word2Vec embeddings and cosine similarity.
🔍 How I Did It
1. Load the Dataset
- Used `pandas` to read a CSV file `headlines.csv` with a `Title` column containing the news headlines.
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
# Load the headlines from CSV file
data = pd.read_csv('headlines.csv')
titles = data['Title'].tolist()
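If `headlines.csv` isn't at hand, a small hypothetical stand-in works the same way; the two lines below would replace the `read_csv` call above (the example titles are made up purely for illustration).
# Hypothetical sample data, only for trying the pipeline without headlines.csv
data = pd.DataFrame({'Title': [
    "Stock markets rally after central bank cuts rates",
    "Markets surge as central bank lowers interest rates",
    "Local team wins championship after dramatic final",
]})
titles = data['Title'].tolist()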
2. Tokenize the Text
- Used `RegexpTokenizer` from `nltk` to split headlines into word tokens, removing punctuation.
# Initialize tokenizer
tknzr = RegexpTokenizer(r'\w+')
# Tokenize all headlines
tokens = []
for title in titles:
    tokens.append(tknzr.tokenize(title.lower()))
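A quick way to see what the tokenizer does to a single headline (the headline below is made up):
# Sanity check on one hypothetical headline
print(tknzr.tokenize("Dow rallies 500 points after rate cut".lower()))
# -> ['dow', 'rallies', '500', 'points', 'after', 'rate', 'cut']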
3. Remove Stop Words
- Used NLTK’s predefined list of English stop words.
- Removed these stop words from each tokenized headline to reduce noise and improve model quality.
# Get English stop words
stop_words = set(stopwords.words('english'))
# Remove stop words from tokens
tokens_no_stop = []
for i in range(len(tokens)):
    tokens_no_stop.append([w for w in tokens[i] if w not in stop_words])
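Note that `stopwords.words('english')` needs the NLTK stop word corpus to be present locally; if it isn't, a one-time download fixes the resulting `LookupError`:
import nltk
nltk.download('stopwords')  # one-time download of NLTK's English stop word list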
4. Train a Word2Vec Model
- Trained a Word2Vec model using Gensim on the tokenized, stop word–free headlines.
- Used `min_count=1` to include all words, ensuring even infrequent terms are embedded.
# Train Word2Vec model
word_vectors = Word2Vec(tokens_no_stop, min_count=1, vector_size=100)
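To get a feel for what the embeddings learned, Gensim's `most_similar` can be queried for any word in the vocabulary. The word below is just a placeholder; substitute one that actually appears in your headlines, and expect noisy neighbours on a small, headline-only corpus.
# Inspect the nearest neighbours of a vocabulary word (placeholder word, may not exist in your data)
if 'police' in word_vectors.wv:
    print(word_vectors.wv.most_similar('police', topn=5))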
5. Create Sentence Vectors
- For each headline, created a sentence vector by averaging the Word2Vec embeddings of the words it contains.
- Initialized a zero vector of length 100 (the model's vector size), summed the word vectors, then divided by the number of words found.
def create_sentence_vectors(tokenized_sentences, word2vec_model):
    sentence_vectors = []
    for sentence in tokenized_sentences:
        # Initialize a zero vector with the model's dimensions
        sentence_vector = np.zeros(word2vec_model.vector_size)
        word_count = 0
        # Accumulate the vector of each word in the sentence
        for word in sentence:
            if word in word2vec_model.wv:
                sentence_vector += word2vec_model.wv[word]
                word_count += 1
        # Average the accumulated vectors if the sentence has any known words
        if word_count > 0:
            sentence_vector /= word_count
        sentence_vectors.append(sentence_vector)
    return sentence_vectors
# Create sentence vectors
sentence_vectors = create_sentence_vectors(tokens_no_stop, word_vectors)
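The result is one 100-dimensional NumPy array per headline, which can be verified quickly:
# One vector per headline, each of length vector_size (100 here)
print(len(sentence_vectors), sentence_vectors[0].shape)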
6. Compute Cosine Similarity
- Used `cosine_similarity` from `sklearn.metrics.pairwise` to measure similarity between sentence vectors.
- Created a similarity matrix for all headlines.
def compute_similarity_matrix(sentence_vectors):
    # Convert the list to a NumPy array for the cosine_similarity function
    vectors_array = np.array(sentence_vectors)
    # Compute the pairwise cosine similarity matrix
    similarity_matrix = cosine_similarity(vectors_array)
    return similarity_matrix
# Compute similarity scores
similarity_matrix = compute_similarity_matrix(sentence_vectors)
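The matrix is square, with one row and one column per headline, and any single pair can be inspected directly (the indices below are arbitrary examples):
# similarity_matrix[i][j] is the cosine similarity between headline i and headline j
print(similarity_matrix.shape)            # (n_headlines, n_headlines)
print(round(similarity_matrix[0][1], 3))  # arbitrary example pair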
7. Cluster Similar Headlines
- Iterated over each headline to form clusters based on semantic similarity:
- If it’s the first headline, it starts its own cluster.
- For every other headline, it is compared with all earlier headlines.
- If its cosine similarity score with any previous headline is ≥ 0.9, it’s added to that cluster.
- If no similar headline is found, a new cluster is created.
def cluster_headlines(titles, similarity_matrix, threshold=0.9):
    clusters = [[] for _ in range(len(titles))]
    cluster_indices = []  # Track which cluster each headline belongs to
    for i in range(len(titles)):
        if i == 0:
            # First headline starts its own cluster
            clusters[0].append(titles[i])
            cluster_indices.append(0)
        else:
            added = False
            # Check similarity with headlines already assigned to clusters
            for j in range(i):
                if similarity_matrix[i][j] >= threshold:
                    # Add to the cluster of the first sufficiently similar headline
                    cluster_idx = cluster_indices[j]
                    clusters[cluster_idx].append(titles[i])
                    cluster_indices.append(cluster_idx)
                    added = True
                    break
            if not added:
                # No similar headline found: start a new cluster
                clusters[i].append(titles[i])
                cluster_indices.append(i)
    # Remove empty clusters
    return [cluster for cluster in clusters if cluster]
# Cluster headlines
headline_clusters = cluster_headlines(titles, similarity_matrix)
# Print the results
print(f"Found {len(headline_clusters)} clusters")
for i, cluster in enumerate(headline_clusters):
    print(f"\nCluster {i+1} - {len(cluster)} headlines:")
    for headline in cluster:
        print(f" • {headline}")
📊 Complete Example
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
# 1. Load the dataset
data = pd.read_csv('headlines.csv')
titles = data['Title'].tolist()
# 2. Tokenize the text
tknzr = RegexpTokenizer(r'\w+')
tokens = []
for title in titles:
    tokens.append(tknzr.tokenize(title.lower()))
# 3. Remove stop words
stop_words = set(stopwords.words('english'))
tokens_no_stop = []
for i in range(len(tokens)):
    tokens_no_stop.append([w for w in tokens[i] if w not in stop_words])
# 4. Train a Word2Vec model
word_vectors = Word2Vec(tokens_no_stop, min_count=1, vector_size=100)
# 5. Create sentence vectors
def create_sentence_vectors(tokenized_sentences, word2vec_model):
    sentence_vectors = []
    for sentence in tokenized_sentences:
        sentence_vector = np.zeros(word2vec_model.vector_size)
        word_count = 0
        for word in sentence:
            if word in word2vec_model.wv:
                sentence_vector += word2vec_model.wv[word]
                word_count += 1
        if word_count > 0:
            sentence_vector /= word_count
        sentence_vectors.append(sentence_vector)
    return sentence_vectors
sentence_vectors = create_sentence_vectors(tokens_no_stop, word_vectors)
# 6. Compute cosine similarity
similarity_matrix = cosine_similarity(np.array(sentence_vectors))
# 7. Cluster similar headlines
def cluster_headlines(titles, similarity_matrix, threshold=0.9):
    clusters = [[] for _ in range(len(titles))]
    cluster_indices = []
    for i in range(len(titles)):
        if i == 0:
            clusters[0].append(titles[i])
            cluster_indices.append(0)
        else:
            added = False
            for j in range(i):
                if similarity_matrix[i][j] >= threshold:
                    cluster_idx = cluster_indices[j]
                    clusters[cluster_idx].append(titles[i])
                    cluster_indices.append(cluster_idx)
                    added = True
                    break
            if not added:
                clusters[i].append(titles[i])
                cluster_indices.append(i)
    return [cluster for cluster in clusters if cluster]
headline_clusters = cluster_headlines(titles, similarity_matrix)
# Print results
print(f"Found {len(headline_clusters)} clusters")
for i, cluster in enumerate(headline_clusters):
    print(f"\nCluster {i+1} - {len(cluster)} headlines:")
    for headline in cluster:
        print(f" • {headline}")