From 274c538b9541831c46118c13715643dd5e877507 Mon Sep 17 00:00:00 2001
From: emmaltaylor
Date: Mon, 24 Nov 2025 13:14:19 -0600
Subject: [PATCH 1/2] Add speaker credibility feature and test script

---
 test_fake_news.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 train.py          | 50 +++++++++++++++++++++++++++++++
 utils.py          | 45 ++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 test_fake_news.py
 create mode 100644 train.py
 create mode 100644 utils.py

diff --git a/test_fake_news.py b/test_fake_news.py
new file mode 100644
index 0000000..7c64707
--- /dev/null
+++ b/test_fake_news.py
@@ -0,0 +1,75 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from scipy.sparse import hstack
+
+# -----------------------
+# UTILITY FUNCTIONS
+# -----------------------
+def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
+    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
+    smoothing = 2 # pseudo-count to avoid extremes
+    speaker_counts['score'] = ((speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
+                               / (speaker_counts['count'] + smoothing))
+    return speaker_counts['score'].to_dict()
+
+def get_speaker_score_dynamic(speaker_name, speaker_scores):
+    return speaker_scores.get(speaker_name, 0.5) # default 0.5 if unknown
+
+# -----------------------
+# SAMPLE DATASET
+# -----------------------
+data = {
+    'text': [
+        "Breaking news: Market hits record high",
+        "Aliens landed in New York City",
+        "New study shows coffee improves memory",
+        "Chocolate cures all diseases",
+        "Local team wins championship",
+        "Government hiding the truth about UFOs",
+        "Scientists discover new species of bird",
+        "Miracle weight loss pills exposed"
+    ],
+    'label': [1,0,1,0,1,0,1,0],
+    'speaker': ["Alice Smith","John Doe","Alice Smith","John Doe",
+                "Bob Lee","Jane Roe","Bob Lee","Jane Roe"]
+}
+
+# Convert to DataFrame
+df = pd.DataFrame(data)
+
+# -----------------------
+# COMPUTE SPEAKER CREDIBILITY
+# -----------------------
+speaker_scores = compute_speaker_scores(df)
+df['speaker_score'] = df['speaker'].apply(lambda x: get_speaker_score_dynamic(x, speaker_scores))
+
+# Print speaker scores for verification
+print("Speaker Credibility Scores:")
+for speaker, score in speaker_scores.items():
+    print(f"{speaker}: {score:.2f}")
+
+# -----------------------
+# TEXT FEATURE EXTRACTION
+# -----------------------
+vectorizer = TfidfVectorizer(max_features=50)
+text_features = vectorizer.fit_transform(df['text'])
+
+# Combine text features with speaker credibility
+speaker_features = df['speaker_score'].values.reshape(-1,1)
+X = hstack([text_features, speaker_features])
+y = df['label']
+
+# -----------------------
+# TRAIN TEST SPLIT AND MODEL
+# -----------------------
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
+clf = RandomForestClassifier(n_estimators=50, random_state=42)
+clf.fit(X_train, y_train)
+
+# -----------------------
+# EVALUATION
+# -----------------------
+accuracy = clf.score(X_test, y_test)
+print(f"Test Accuracy: {accuracy:.2f}")
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..be25572
--- /dev/null
+++ b/train.py
@@ -0,0 +1,50 @@
+# train.py
+# Training script for Fake News Detection
+# Adds dynamic speaker credibility as an additional feature
+
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from scipy.sparse import hstack # to combine sparse matrices
+from utils import compute_speaker_scores, get_speaker_score_dynamic
+
+# Step 1: Load dataset
+# Assume CSV has columns: 'text', 'label', 'speaker'
+df = pd.read_csv('data/news_data.csv')
+
+# Step 2: Compute speaker credibility scores dynamically
+# This creates a dictionary {speaker_name: score} based on historical labels
+speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')
+
+# Step 3: Convert text into TF-IDF features
+# max_features limits the number of features for efficiency
+vectorizer = TfidfVectorizer(max_features=5000)
+text_features = vectorizer.fit_transform(df['text'])
+
+# Step 4: Generate speaker credibility feature
+# Apply the dynamic score function to each row's speaker
+speaker_features = df['speaker'].apply(
+    lambda x: get_speaker_score_dynamic(x, speaker_scores)
+).values.reshape(-1, 1) # reshape to 2D array for stacking
+
+# Step 5: Combine text features with speaker credibility
+# hstack allows us to combine sparse text matrix with dense speaker feature
+X = hstack([text_features, speaker_features])
+
+# Labels (1=real, 0=fake)
+y = df['label']
+
+# Step 6: Split data into training and test sets
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# Step 7: Train the classifier
+# Using Random Forest; you can replace with other models if desired
+clf = RandomForestClassifier(n_estimators=100, random_state=42)
+clf.fit(X_train, y_train)
+
+# Step 8: Evaluate the model
+accuracy = clf.score(X_test, y_test)
+print(f"Model Accuracy: {accuracy:.4f}")
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..2b0a47a
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,45 @@
+# utils.py
+# Utility functions for speaker credibility score
+
+import pandas as pd
+
+def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
+    """
+    Computes credibility scores for each speaker based on historical article labels.
+
+    Parameters:
+    - df: pandas DataFrame containing the news dataset
+    - speaker_col: column name containing speaker/author names
+    - label_col: column name containing article labels (1=real, 0=fake)
+
+    Returns:
+    - speaker_scores: dictionary {speaker_name: credibility_score}
+    """
+    # Group by speaker and calculate mean label (fraction of real articles) and count of articles
+    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
+
+    # Apply smoothing to avoid extreme scores for speakers with very few articles
+    smoothing = 2 # pseudo-count
+    # Compute smoothed credibility score: blends speaker's mean with neutral 0.5
+    speaker_counts['score'] = (
+        (speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
+        / (speaker_counts['count'] + smoothing)
+    )
+
+    # Convert the result to a dictionary: {speaker_name: credibility_score}
+    speaker_scores = speaker_counts['score'].to_dict()
+    return speaker_scores
+
+def get_speaker_score_dynamic(speaker_name, speaker_scores):
+    """
+    Fetches the credibility score for a given speaker.
+
+    Parameters:
+    - speaker_name: name of the speaker/author
+    - speaker_scores: dictionary from compute_speaker_scores
+
+    Returns:
+    - credibility score (0 to 1)
+    - defaults to 0.5 if speaker is unknown
+    """
+    return speaker_scores.get(speaker_name, 0.5)
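A quick worked example of the smoothed score above, using the toy data from
test_fake_news.py: "Alice Smith" has two articles, both labeled real, so
mean = 1.0 and count = 2. With smoothing = 2 the score is
(1.0 * 2 + 0.5 * 2) / (2 + 2) = 3 / 4 = 0.75, pulled toward the neutral 0.5
rather than a hard 1.0. "John Doe" (two fakes) lands at 0.25 by the same
arithmetic, and any speaker absent from the dictionary defaults to 0.5.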
From 712f37f6631042ff909e0a312890725bad87ce93 Mon Sep 17 00:00:00 2001
From: emmaltaylor
Date: Sat, 29 Nov 2025 19:33:38 -0600
Subject: [PATCH 2/2] Update train.py with LIAR dataset fixes

---
 train.py | 68 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 20 deletions(-)

diff --git a/train.py b/train.py
index be25572..079963c 100644
--- a/train.py
+++ b/train.py
@@ -1,5 +1,5 @@
 # train.py
-# Training script for Fake News Detection
+# Training script for Fake News Detection using LIAR dataset
 # Adds dynamic speaker credibility as an additional feature
 
 import pandas as pd
@@ -9,42 +9,70 @@
 from scipy.sparse import hstack # to combine sparse matrices
 from utils import compute_speaker_scores, get_speaker_score_dynamic
 
-# Step 1: Load dataset
-# Assume CSV has columns: 'text', 'label', 'speaker'
-df = pd.read_csv('data/news_data.csv')
+print("✅ Starting train.py")
 
-# Step 2: Compute speaker credibility scores dynamically
-# This creates a dictionary {speaker_name: score} based on historical labels
+# Step 1: Load LIAR dataset (TSV) without headers
+df = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None)
+
+# Step 2: Inspect first row and number of columns
+print("Number of columns detected:", len(df.columns))
+print("First row sample:", df.iloc[0].tolist())
+
+# Step 3: Assign column names dynamically based on number of columns
+# Default LIAR column names (14 columns)
+default_cols = [
+    'id','label','statement','subject','speaker','job','state','party',
+    'barely-true','false','half-true','mostly-true','pants-fire','context'
+]
+
+# Keep only as many names as the file has columns
+df.columns = default_cols[:len(df.columns)]
+
+# Step 4: Keep only the columns we need
+# If your dataset does not have 'speaker', create a placeholder
+if 'speaker' not in df.columns:
+    df['speaker'] = 'Unknown'
+
+df = df[['statement', 'label', 'speaker']]
+df.rename(columns={'statement':'text'}, inplace=True)
+
+# Step 5: Map original LIAR labels to binary
+true_labels = ['true', 'mostly-true', 'half-true']
+df['label'] = df['label'].apply(lambda x: 1 if x in true_labels else 0)
+
+# Step 6: Fill missing text
+df['text'] = df['text'].fillna("")
+
+print(f"✅ Loaded dataset with {len(df)} rows")
+
+# Step 7: Compute dynamic speaker credibility scores
 speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')
+print("✅ Speaker scores computed")
 
-# Step 3: Convert text into TF-IDF features
-# max_features limits the number of features for efficiency
+# Step 8: Convert text into TF-IDF features
 vectorizer = TfidfVectorizer(max_features=5000)
 text_features = vectorizer.fit_transform(df['text'])
+print("✅ TF-IDF complete")
 
-# Step 4: Generate speaker credibility feature
-# Apply the dynamic score function to each row's speaker
+# Step 9: Generate speaker credibility feature
 speaker_features = df['speaker'].apply(
     lambda x: get_speaker_score_dynamic(x, speaker_scores)
-).values.reshape(-1, 1) # reshape to 2D array for stacking
+).values.reshape(-1, 1)
 
-# Step 5: Combine text features with speaker credibility
-# hstack allows us to combine sparse text matrix with dense speaker feature
+# Step 10: Combine text features with speaker credibility
 X = hstack([text_features, speaker_features])
-
-# Labels (1=real, 0=fake)
 y = df['label']
 
-# Step 6: Split data into training and test sets
+# Step 11: Split data into training and test sets
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42
 )
 
-# Step 7: Train the classifier
-# Using Random Forest; you can replace with other models if desired
+# Step 12: Train the Random Forest classifier
 clf = RandomForestClassifier(n_estimators=100, random_state=42)
 clf.fit(X_train, y_train)
+print("✅ Model trained")
 
-# Step 8: Evaluate the model
+# Step 13: Evaluate the model
 accuracy = clf.score(X_test, y_test)
-print(f"Model Accuracy: {accuracy:.4f}")
+print(f"✅ Model Accuracy: {accuracy:.4f}")
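A minimal inference sketch (not part of either patch), assuming the fitted
`vectorizer`, `speaker_scores`, and `clf` from train.py are still in scope;
`new_statement` and `new_speaker` are hypothetical names chosen here for
illustration:

    import numpy as np
    from scipy.sparse import hstack
    from utils import get_speaker_score_dynamic

    new_statement = "Says the economy added jobs every month this year"  # hypothetical input
    new_speaker = "barack-obama"  # hypothetical; unseen speakers fall back to the 0.5 prior

    text_vec = vectorizer.transform([new_statement])  # reuse the fitted TF-IDF vocabulary
    cred = np.array([[get_speaker_score_dynamic(new_speaker, speaker_scores)]])
    x_new = hstack([text_vec, cred])  # same text + credibility layout as training

    print("Predicted label (1=real, 0=fake):", clf.predict(x_new)[0])

Note that the credibility column is computed on the full dataset before the
train/test split, so the reported accuracy is optimistic; computing
speaker_scores on the training split only would give a leakage-free estimate.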