From 274c538b9541831c46118c13715643dd5e877507 Mon Sep 17 00:00:00 2001
From: emmaltaylor
Date: Mon, 24 Nov 2025 13:14:19 -0600
Subject: [PATCH 1/2] Add speaker credibility feature and test script

---
 test_fake_news.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 train.py          | 50 +++++++++++++++++++++++++++++++
 utils.py          | 45 ++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 test_fake_news.py
 create mode 100644 train.py
 create mode 100644 utils.py

diff --git a/test_fake_news.py b/test_fake_news.py
new file mode 100644
index 0000000..7c64707
--- /dev/null
+++ b/test_fake_news.py
@@ -0,0 +1,75 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from scipy.sparse import hstack
+
+# -----------------------
+# UTILITY FUNCTIONS
+# -----------------------
+def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
+    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
+    smoothing = 2 # pseudo-count to avoid extremes
+    speaker_counts['score'] = ((speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
+                               / (speaker_counts['count'] + smoothing))
+    return speaker_counts['score'].to_dict()
+
+def get_speaker_score_dynamic(speaker_name, speaker_scores):
+    return speaker_scores.get(speaker_name, 0.5) # default 0.5 if unknown
+
+# -----------------------
+# SAMPLE DATASET
+# -----------------------
+data = {
+    'text': [
+        "Breaking news: Market hits record high",
+        "Aliens landed in New York City",
+        "New study shows coffee improves memory",
+        "Chocolate cures all diseases",
+        "Local team wins championship",
+        "Government hiding the truth about UFOs",
+        "Scientists discover new species of bird",
+        "Miracle weight loss pills exposed"
+    ],
+    'label': [1,0,1,0,1,0,1,0],
+    'speaker': ["Alice Smith","John Doe","Alice Smith","John Doe",
+                "Bob Lee","Jane Roe","Bob Lee","Jane Roe"]
+}
+
+# Convert to DataFrame
+df = pd.DataFrame(data)
+
+# -----------------------
+# COMPUTE SPEAKER CREDIBILITY
+# -----------------------
+speaker_scores = compute_speaker_scores(df)
+df['speaker_score'] = df['speaker'].apply(lambda x: get_speaker_score_dynamic(x, speaker_scores))
+
+# Print speaker scores for verification
+print("Speaker Credibility Scores:")
+for speaker, score in speaker_scores.items():
+    print(f"{speaker}: {score:.2f}")
+
+# -----------------------
+# TEXT FEATURE EXTRACTION
+# -----------------------
+vectorizer = TfidfVectorizer(max_features=50)
+text_features = vectorizer.fit_transform(df['text'])
+
+# Combine text features with speaker credibility
+speaker_features = df['speaker_score'].values.reshape(-1,1)
+X = hstack([text_features, speaker_features])
+y = df['label']
+
+# -----------------------
+# TRAIN TEST SPLIT AND MODEL
+# -----------------------
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
+clf = RandomForestClassifier(n_estimators=50, random_state=42)
+clf.fit(X_train, y_train)
+
+# -----------------------
+# EVALUATION
+# -----------------------
+accuracy = clf.score(X_test, y_test)
+print(f"Test Accuracy: {accuracy:.2f}")
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..be25572
--- /dev/null
+++ b/train.py
@@ -0,0 +1,50 @@
+# train.py
+# Training script for Fake News Detection
+# Adds dynamic speaker credibility as an additional feature
+
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from scipy.sparse import hstack # to combine sparse matrices
+from utils import compute_speaker_scores, get_speaker_score_dynamic
+
+# Step 1: Load dataset
+# Assume CSV has columns: 'text', 'label', 'speaker'
+df = pd.read_csv('data/news_data.csv')
+
+# Step 2: Compute speaker credibility scores dynamically
+# This creates a dictionary {speaker_name: score} based on historical labels
+speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')
+
+# Step 3: Convert text into TF-IDF features
+# max_features limits the number of features for efficiency
+vectorizer = TfidfVectorizer(max_features=5000)
+text_features = vectorizer.fit_transform(df['text'])
+
+# Step 4: Generate speaker credibility feature
+# Apply the dynamic score function to each row's speaker
+speaker_features = df['speaker'].apply(
+    lambda x: get_speaker_score_dynamic(x, speaker_scores)
+).values.reshape(-1, 1) # reshape to 2D array for stacking
+
+# Step 5: Combine text features with speaker credibility
+# hstack allows us to combine sparse text matrix with dense speaker feature
+X = hstack([text_features, speaker_features])
+
+# Labels (1=real, 0=fake)
+y = df['label']
+
+# Step 6: Split data into training and test sets
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# Step 7: Train the classifier
+# Using Random Forest; you can replace with other models if desired
+clf = RandomForestClassifier(n_estimators=100, random_state=42)
+clf.fit(X_train, y_train)
+
+# Step 8: Evaluate the model
+accuracy = clf.score(X_test, y_test)
+print(f"Model Accuracy: {accuracy:.4f}")
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..2b0a47a
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,45 @@
+# utils.py
+# Utility functions for speaker credibility score
+
+import pandas as pd
+
+def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
+    """
+    Computes credibility scores for each speaker based on historical article labels.
+
+    Parameters:
+    - df: pandas DataFrame containing the news dataset
+    - speaker_col: column name containing speaker/author names
+    - label_col: column name containing article labels (1=real, 0=fake)
+
+    Returns:
+    - speaker_scores: dictionary {speaker_name: credibility_score}
+    """
+    # Group by speaker and calculate mean label (fraction of real articles) and count of articles
+    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
+
+    # Apply smoothing to avoid extreme scores for speakers with very few articles
+    smoothing = 2 # pseudo-count
+    # Compute smoothed credibility score: blends speaker's mean with neutral 0.5
+    speaker_counts['score'] = (
+        (speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
+        / (speaker_counts['count'] + smoothing)
+    )
+
+    # Convert the result to a dictionary: {speaker_name: credibility_score}
+    speaker_scores = speaker_counts['score'].to_dict()
+    return speaker_scores
+
+def get_speaker_score_dynamic(speaker_name, speaker_scores):
+    """
+    Fetches the credibility score for a given speaker.
+
+    Parameters:
+    - speaker_name: name of the speaker/author
+    - speaker_scores: dictionary from compute_speaker_scores
+
+    Returns:
+    - credibility score (0 to 1)
+    - defaults to 0.5 if speaker is unknown
+    """
+    return speaker_scores.get(speaker_name, 0.5)
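A quick worked example of the smoothed score above, using the toy data from
test_fake_news.py: "Alice Smith" has two articles, both labeled real, so
mean = 1.0 and count = 2. With smoothing = 2 the score is
(1.0 * 2 + 0.5 * 2) / (2 + 2) = 3 / 4 = 0.75, pulled toward the neutral 0.5
rather than a hard 1.0. "John Doe" (two fakes) lands at 0.25 by the same
arithmetic, and any speaker absent from the dictionary defaults to 0.5.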
From 712f37f6631042ff909e0a312890725bad87ce93 Mon Sep 17 00:00:00 2001
From: emmaltaylor
Date: Sat, 29 Nov 2025 19:33:38 -0600
Subject: [PATCH 2/2] Update train.py with LIAR dataset fixes

---
 train.py | 68 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 20 deletions(-)

diff --git a/train.py b/train.py
index be25572..079963c 100644
--- a/train.py
+++ b/train.py
@@ -1,5 +1,5 @@
 # train.py
-# Training script for Fake News Detection
+# Training script for Fake News Detection using LIAR dataset
 # Adds dynamic speaker credibility as an additional feature
 
 import pandas as pd
@@ -9,42 +9,70 @@
 from scipy.sparse import hstack # to combine sparse matrices
 from utils import compute_speaker_scores, get_speaker_score_dynamic
 
-# Step 1: Load dataset
-# Assume CSV has columns: 'text', 'label', 'speaker'
-df = pd.read_csv('data/news_data.csv')
+print("✅ Starting train.py")
 
-# Step 2: Compute speaker credibility scores dynamically
-# This creates a dictionary {speaker_name: score} based on historical labels
+# Step 1: Load LIAR dataset (TSV) without headers
+df = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None)
+
+# Step 2: Inspect first row and number of columns
+print("Number of columns detected:", len(df.columns))
+print("First row sample:", df.iloc[0].tolist())
+
+# Step 3: Assign column names dynamically based on number of columns
+# Default LIAR column names (14 columns)
+default_cols = [
+    'id','label','statement','subject','speaker','job','state','party',
+    'barely-true','false','half-true','mostly-true','pants-fire','context'
+]
+
+# Keep only as many names as the file has columns
+df.columns = default_cols[:len(df.columns)]
+
+# Step 4: Keep only the columns we need
+# If your dataset does not have 'speaker', create a placeholder
+if 'speaker' not in df.columns:
+    df['speaker'] = 'Unknown'
+
+df = df[['statement', 'label', 'speaker']]
+df.rename(columns={'statement':'text'}, inplace=True)
+
+# Step 5: Map original LIAR labels to binary
+true_labels = ['true', 'mostly-true', 'half-true']
+df['label'] = df['label'].apply(lambda x: 1 if x in true_labels else 0)
+
+# Step 6: Fill missing text
+df['text'] = df['text'].fillna("")
+
+print(f"✅ Loaded dataset with {len(df)} rows")
+
+# Step 7: Compute dynamic speaker credibility scores
 speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')
+print("✅ Speaker scores computed")
 
-# Step 3: Convert text into TF-IDF features
-# max_features limits the number of features for efficiency
+# Step 8: Convert text into TF-IDF features
 vectorizer = TfidfVectorizer(max_features=5000)
 text_features = vectorizer.fit_transform(df['text'])
+print("✅ TF-IDF complete")
 
-# Step 4: Generate speaker credibility feature
-# Apply the dynamic score function to each row's speaker
+# Step 9: Generate speaker credibility feature
 speaker_features = df['speaker'].apply(
     lambda x: get_speaker_score_dynamic(x, speaker_scores)
-).values.reshape(-1, 1) # reshape to 2D array for stacking
+).values.reshape(-1, 1)
 
-# Step 5: Combine text features with speaker credibility
-# hstack allows us to combine sparse text matrix with dense speaker feature
+# Step 10: Combine text features with speaker credibility
 X = hstack([text_features, speaker_features])
-
-# Labels (1=real, 0=fake)
 y = df['label']
 
-# Step 6: Split data into training and test sets
+# Step 11: Split data into training and test sets
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42
 )
 
-# Step 7: Train the classifier
-# Using Random Forest; you can replace with other models if desired
+# Step 12: Train the Random Forest classifier
 clf = RandomForestClassifier(n_estimators=100, random_state=42)
 clf.fit(X_train, y_train)
+print("✅ Model trained")
 
-# Step 8: Evaluate the model
+# Step 13: Evaluate the model
 accuracy = clf.score(X_test, y_test)
-print(f"Model Accuracy: {accuracy:.4f}")
+print(f"✅ Model Accuracy: {accuracy:.4f}")
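A minimal inference sketch (not part of either patch), assuming the fitted
`vectorizer`, `speaker_scores`, and `clf` from train.py are still in scope;
`new_statement` and `new_speaker` are hypothetical names chosen here for
illustration:

    import numpy as np
    from scipy.sparse import hstack
    from utils import get_speaker_score_dynamic

    new_statement = "Says the economy added jobs every month this year"  # hypothetical input
    new_speaker = "barack-obama"  # hypothetical; unseen speakers fall back to the 0.5 prior

    text_vec = vectorizer.transform([new_statement])  # reuse the fitted TF-IDF vocabulary
    cred = np.array([[get_speaker_score_dynamic(new_speaker, speaker_scores)]])
    x_new = hstack([text_vec, cred])  # same text + credibility layout as training

    print("Predicted label (1=real, 0=fake):", clf.predict(x_new)[0])

Note that the credibility column is computed on the full dataset before the
train/test split, so the reported accuracy is optimistic; computing
speaker_scores on the training split only would give a leakage-free estimate.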