Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions test_fake_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

# -----------------------
# UTILITY FUNCTIONS
# -----------------------
def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
smoothing = 2 # pseudo-count to avoid extremes
speaker_counts['score'] = ((speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
/ (speaker_counts['count'] + smoothing))
return speaker_counts['score'].to_dict()

def get_speaker_score_dynamic(speaker_name, speaker_scores):
return speaker_scores.get(speaker_name, 0.5) # default 0.5 if unknown

# -----------------------
# SAMPLE DATASET
# -----------------------
data = {
'text': [
"Breaking news: Market hits record high",
"Aliens landed in New York City",
"New study shows coffee improves memory",
"Chocolate cures all diseases",
"Local team wins championship",
"Government hiding the truth about UFOs",
"Scientists discover new species of bird",
"Miracle weight loss pills exposed"
],
'label': [1,0,1,0,1,0,1,0],
'speaker': ["Alice Smith","John Doe","Alice Smith","John Doe",
"Bob Lee","Jane Roe","Bob Lee","Jane Roe"]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# -----------------------
# COMPUTE SPEAKER CREDIBILITY
# -----------------------
speaker_scores = compute_speaker_scores(df)
df['speaker_score'] = df['speaker'].apply(lambda x: get_speaker_score_dynamic(x, speaker_scores))

# Print speaker scores for verification
print("Speaker Credibility Scores:")
for speaker, score in speaker_scores.items():
print(f"{speaker}: {score:.2f}")

# -----------------------
# TEXT FEATURE EXTRACTION
# -----------------------
vectorizer = TfidfVectorizer(max_features=50)
text_features = vectorizer.fit_transform(df['text'])

# Combine text features with speaker credibility
speaker_features = df['speaker_score'].values.reshape(-1,1)
X = hstack([text_features, speaker_features])
y = df['label']

# -----------------------
# TRAIN TEST SPLIT AND MODEL
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

# -----------------------
# EVALUATION
# -----------------------
accuracy = clf.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
78 changes: 78 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# train.py
# Training script for Fake News Detection using LIAR dataset
# Adds dynamic speaker credibility as an additional feature

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack # to combine sparse matrices
from utils import compute_speaker_scores, get_speaker_score_dynamic

print("✅ Starting train.py")

# Step 1: Load LIAR dataset (TSV) without headers
df = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None)

# Step 2: Inspect first row and number of columns
print("Number of columns detected:", len(df.columns))
print("First row sample:", df.iloc[0].tolist())

# Step 3: Assign column names dynamically based on number of columns
# Default LIAR column names (15 columns)
default_cols = [
'id','label','statement','subject','speaker','job','state','party',
'barely-true','false','half-true','mostly-true','true','pants-fire','context'
]

# Keep only as many names as the file has columns
df.columns = default_cols[:len(df.columns)]

# Step 4: Keep only the columns we need
# If your dataset does not have 'speaker', create a placeholder
if 'speaker' not in df.columns:
df['speaker'] = 'Unknown'

df = df[['statement', 'label', 'speaker']]
df.rename(columns={'statement':'text'}, inplace=True)

# Step 5: Map original LIAR labels to binary
true_labels = ['true', 'mostly-true', 'half-true']
df['label'] = df['label'].apply(lambda x: 1 if x in true_labels else 0)

# Step 6: Fill missing text
df['text'] = df['text'].fillna("")

print(f"✅ Loaded dataset with {len(df)} rows")

# Step 7: Compute dynamic speaker credibility scores
speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')
print("✅ Speaker scores computed")

# Step 8: Convert text into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
text_features = vectorizer.fit_transform(df['text'])
print("✅ TF-IDF complete")

# Step 9: Generate speaker credibility feature
speaker_features = df['speaker'].apply(
lambda x: get_speaker_score_dynamic(x, speaker_scores)
).values.reshape(-1, 1)

# Step 10: Combine text features with speaker credibility
X = hstack([text_features, speaker_features])
y = df['label']

# Step 11: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)

# Step 12: Train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print("✅ Model trained")

# Step 13: Evaluate the model
accuracy = clf.score(X_test, y_test)
print(f"✅ Model Accuracy: {accuracy:.4f}")
45 changes: 45 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# utils.py
# Utility functions for speaker credibility score

import pandas as pd

def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
"""
Computes credibility scores for each speaker based on historical article labels.

Parameters:
- df: pandas DataFrame containing the news dataset
- speaker_col: column name containing speaker/author names
- label_col: column name containing article labels (1=real, 0=fake)

Returns:
- speaker_scores: dictionary {speaker_name: credibility_score}
"""
# Group by speaker and calculate mean label (fraction of real articles) and count of articles
speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])

# Apply smoothing to avoid extreme scores for speakers with very few articles
smoothing = 2 # pseudo-count
# Compute smoothed credibility score: blends speaker's mean with neutral 0.5
speaker_counts['score'] = (
(speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
/ (speaker_counts['count'] + smoothing)
)

# Convert the result to a dictionary: {speaker_name: credibility_score}
speaker_scores = speaker_counts['score'].to_dict()
return speaker_scores

def get_speaker_score_dynamic(speaker_name, speaker_scores):
"""
Fetches the credibility score for a given speaker.

Parameters:
- speaker_name: name of the speaker/author
- speaker_scores: dictionary from compute_speaker_scores

Returns:
- credibility score (0 to 1)
- defaults to 0.5 if speaker is unknown
"""
return speaker_scores.get(speaker_name, 0.5)