nishitpatel01 · emmaltaylor · Nov 24, 2025 · Nov 30, 2025
diff --git a/test_fake_news.py b/test_fake_news.py
@@ -0,0 +1,75 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from scipy.sparse import hstack
+
+# -----------------------
+# UTILITY FUNCTIONS
+# -----------------------
+def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
+    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
+    smoothing = 2  # pseudo-count to avoid extremes
+    speaker_counts['score'] = ((speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing) 
+                               / (speaker_counts['count'] + smoothing))
+    return speaker_counts['score'].to_dict()
+
+def get_speaker_score_dynamic(speaker_name, speaker_scores):
+    return speaker_scores.get(speaker_name, 0.5)  # default 0.5 if unknown
+
+# -----------------------
+# SAMPLE DATASET
+# -----------------------
+data = {
+    'text': [
+        "Breaking news: Market hits record high",
+        "Aliens landed in New York City",
+        "New study shows coffee improves memory",
+        "Chocolate cures all diseases",
+        "Local team wins championship",
+        "Government hiding the truth about UFOs",
+        "Scientists discover new species of bird",
+        "Miracle weight loss pills exposed"
+    ],
+    'label': [1,0,1,0,1,0,1,0],
+    'speaker': ["Alice Smith","John Doe","Alice Smith","John Doe",
+                "Bob Lee","Jane Roe","Bob Lee","Jane Roe"]
+}
+
+# Convert to DataFrame
+df = pd.DataFrame(data)
+
+# -----------------------
+# COMPUTE SPEAKER CREDIBILITY
+# -----------------------
+speaker_scores = compute_speaker_scores(df)
+df['speaker_score'] = df['speaker'].apply(lambda x: get_speaker_score_dynamic(x, speaker_scores))
+
+# Print speaker scores for verification
+print("Speaker Credibility Scores:")
+for speaker, score in speaker_scores.items():
+    print(f"{speaker}: {score:.2f}")
+
+# -----------------------
+# TEXT FEATURE EXTRACTION
+# -----------------------
+vectorizer = TfidfVectorizer(max_features=50)
+text_features = vectorizer.fit_transform(df['text'])
+
+# Combine text features with speaker credibility
+speaker_features = df['speaker_score'].values.reshape(-1,1)
+X = hstack([text_features, speaker_features])
+y = df['label']
+
+# -----------------------
+# TRAIN TEST SPLIT AND MODEL
+# -----------------------
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
+clf = RandomForestClassifier(n_estimators=50, random_state=42)
+clf.fit(X_train, y_train)
+
+# -----------------------
+# EVALUATION
+# -----------------------
+accuracy = clf.score(X_test, y_test)
+print(f"Test Accuracy: {accuracy:.2f}")
diff --git a/train.py b/train.py
@@ -0,0 +1,78 @@
+# train.py
+# Training script for Fake News Detection using LIAR dataset
+# Adds dynamic speaker credibility as an additional feature
+
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from scipy.sparse import hstack  # to combine sparse matrices
+from utils import compute_speaker_scores, get_speaker_score_dynamic
+
+print("✅ Starting train.py")
+
+# Step 1: Load LIAR dataset (TSV) without headers
+df = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None)
+
+# Step 2: Inspect first row and number of columns
+print("Number of columns detected:", len(df.columns))
+print("First row sample:", df.iloc[0].tolist())
+
+# Step 3: Assign column names dynamically based on number of columns
+# Default LIAR column names (15 columns)
+default_cols = [
+    'id','label','statement','subject','speaker','job','state','party',
+    'barely-true','false','half-true','mostly-true','true','pants-fire','context'
+]
+
+# Keep only as many names as the file has columns
+df.columns = default_cols[:len(df.columns)]
+
+# Step 4: Keep only the columns we need
+# If your dataset does not have 'speaker', create a placeholder
+if 'speaker' not in df.columns:
+    df['speaker'] = 'Unknown'
+
+df = df[['statement', 'label', 'speaker']]
+df.rename(columns={'statement':'text'}, inplace=True)
+
+# Step 5: Map original LIAR labels to binary
+true_labels = ['true', 'mostly-true', 'half-true']
+df['label'] = df['label'].apply(lambda x: 1 if x in true_labels else 0)
+
+# Step 6: Fill missing text
+df['text'] = df['text'].fillna("")
+
+print(f"✅ Loaded dataset with {len(df)} rows")
+
+# Step 7: Compute dynamic speaker credibility scores
+speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')
+print("✅ Speaker scores computed")
+
+# Step 8: Convert text into TF-IDF features
+vectorizer = TfidfVectorizer(max_features=5000)
+text_features = vectorizer.fit_transform(df['text'])
+print("✅ TF-IDF complete")
+
+# Step 9: Generate speaker credibility feature
+speaker_features = df['speaker'].apply(
+    lambda x: get_speaker_score_dynamic(x, speaker_scores)
+).values.reshape(-1, 1)
+
+# Step 10: Combine text features with speaker credibility
+X = hstack([text_features, speaker_features])
+y = df['label']
+
+# Step 11: Split data into training and test sets
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# Step 12: Train the Random Forest classifier
+clf = RandomForestClassifier(n_estimators=100, random_state=42)
+clf.fit(X_train, y_train)
+print("✅ Model trained")
+
+# Step 13: Evaluate the model
+accuracy = clf.score(X_test, y_test)
+print(f"✅ Model Accuracy: {accuracy:.4f}")
diff --git a/utils.py b/utils.py
@@ -0,0 +1,45 @@
+# utils.py
+# Utility functions for speaker credibility score
+
+import pandas as pd
+
+def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
+    """
+    Computes credibility scores for each speaker based on historical article labels.
+
+    Parameters:
+    - df: pandas DataFrame containing the news dataset
+    - speaker_col: column name containing speaker/author names
+    - label_col: column name containing article labels (1=real, 0=fake)
+
+    Returns:
+    - speaker_scores: dictionary {speaker_name: credibility_score}
+    """
+    # Group by speaker and calculate mean label (fraction of real articles) and count of articles
+    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
+
+    # Apply smoothing to avoid extreme scores for speakers with very few articles
+    smoothing = 2  # pseudo-count
+    # Compute smoothed credibility score: blends speaker's mean with neutral 0.5
+    speaker_counts['score'] = (
+        (speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing) 
+        / (speaker_counts['count'] + smoothing)
+    )
+
+    # Convert the result to a dictionary: {speaker_name: credibility_score}
+    speaker_scores = speaker_counts['score'].to_dict()
+    return speaker_scores
+
+def get_speaker_score_dynamic(speaker_name, speaker_scores):
+    """
+    Fetches the credibility score for a given speaker.
+
+    Parameters:
+    - speaker_name: name of the speaker/author
+    - speaker_scores: dictionary from compute_speaker_scores
+
+    Returns:
+    - credibility score (0 to 1)
+    - defaults to 0.5 if speaker is unknown
+    """
+    return speaker_scores.get(speaker_name, 0.5)