diff --git a/TypeTruth.egg-info/PKG-INFO b/TypeTruth.egg-info/PKG-INFO new file mode 100644 index 0000000..955805b --- /dev/null +++ b/TypeTruth.egg-info/PKG-INFO @@ -0,0 +1,89 @@ +Metadata-Version: 2.4 +Name: TypeTruth +Version: 0.1.0 +Summary: TypeTruth is a Python library that detects whether a text is written by a human or AI. Ideal for fact-checking and content validation in the age of AI content generators. Detect content generated by chatGPT, GPT4, GPT3.5, GPT3, Falcon, Vicuna, Stanford Alpaca, and LlAMA +Home-page: https://github.com/bhaskatripathi/TypeTruth +Author: Bhaskar Tripathi +Author-email: bhaskar.tripathi@gmail.com +Classifier: Development Status :: 3 - Alpha +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Requires-Python: >=3.7 +Description-Content-Type: text/markdown +License-File: LICENSE.txt +Requires-Dist: requests +Requires-Dist: numpy +Requires-Dist: pandas +Requires-Dist: matplotlib +Requires-Dist: seaborn +Requires-Dist: torch +Requires-Dist: scikit-learn +Requires-Dist: transformers +Dynamic: author +Dynamic: author-email +Dynamic: classifier +Dynamic: description +Dynamic: description-content-type +Dynamic: home-page +Dynamic: license-file +Dynamic: requires-dist +Dynamic: requires-python +Dynamic: summary + +## Problem Statement: +- **Sophisticated language models** like OpenAI's GPT series, Falcon etc have blurred the lines between human-written and AI-generated text. +- **Distinguishing** between AI and human-generated content has become a complex task with crucial implications: +- **Information Validity**: AI-generated text may not always offer accurate or reliable information. +- **Authenticity**: Textual content is often used to gauge the knowledge, opinions, and expertise of its author. AI-generated content obscures such assessments. +- **Accountability**: In contexts where content can have serious consequences (e.g., news articles, legal documents), it's vital to identify its origin. + +# TypeTruth +TypeTruth is a Python library that detects whether a text is written by a human or AI. Ideal for fact-checking and content validation in the age of AI content generators. It offers AI Content Detection at Paragraph Level as well as Sentence Level. The solution also provides visualizations to better understand the detection results, such as bar plots and heat maps. + +# Sample Output: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bhaskatripathi/TypeTruth/blob/main/TypeTruth_Collab_Notebook.ipynb) + +### Paragraph Level: +![image](https://github.com/bhaskatripathi/TypeTruth/assets/35177508/981cc67d-6973-46ad-acdf-acc6d33fc4fc) +### Sentence Level: +![image](https://github.com/bhaskatripathi/TypeTruth/assets/35177508/3b95ab61-dfdd-4b73-89b0-fa6290c55b25) + +# UML +I am going to update the code to work with [Falcon](https://huggingface.co/spaces/HuggingFaceH4/falcon-chat), so you see the sequence diagram for Falcon. + +![UML Diagram](https://raw.githubusercontent.com/bhaskatripathi/TypeTruth/main/diagram.svg) + +# Free Usage using Bearer Key +## Bearer Key + +Either you can use your own OpenAI key or you can use a bearer key available for free. To obtain a bearer key, follow this procedure: + +1. Open [this URL](https://platform.openai.com/ai-text-classifier) in your browser. +2. Enter a 1000-word text, Submit and Right-click and select "Inspect" to open the developer tools. +3. Click on the "Network" tab. +4. Look for a POST request under the "Name" column in the list that appears. It should be related to "completions". +5. Click on the POST request and find the "Authorization" section under the "Headers" tab. +6. The bearer key is located in the "Authorization" section and it begins with the word "Bearer", as described in the image below. +![image](https://github.com/bhaskatripathi/TypeTruth/assets/35177508/9aa86989-0ea3-4d9b-a5be-43c5f0c5eea0) + +# Directory Structure +``` +ai_text_detector/ +|--- ai_text_detector/ +| |--- __init__.py +| |--- ai_detector.py +| |--- plotting.py +|--- setup.py +|--- TypeTruth_Collab_Notebook.ipynb +|--- README.md +|--- LICENSE.txt +``` +# Star +Note: Please star this project if you find it useful. +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=bhaskatripathi/TypeTruth&type=Date)](https://star-history.com/#bhaskatripathi/TypeTruth&Date) + diff --git a/TypeTruth.egg-info/SOURCES.txt b/TypeTruth.egg-info/SOURCES.txt new file mode 100644 index 0000000..0fedafb --- /dev/null +++ b/TypeTruth.egg-info/SOURCES.txt @@ -0,0 +1,13 @@ +LICENSE.txt +MANIFEST.in +README.md +setup.py +TypeTruth/__init__.py +TypeTruth/aitextdetector.py +TypeTruth/plotting.py +TypeTruth.egg-info/PKG-INFO +TypeTruth.egg-info/SOURCES.txt +TypeTruth.egg-info/dependency_links.txt +TypeTruth.egg-info/requires.txt +TypeTruth.egg-info/top_level.txt +tests/test_aitextdetector.py \ No newline at end of file diff --git a/TypeTruth.egg-info/dependency_links.txt b/TypeTruth.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/TypeTruth.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/TypeTruth.egg-info/requires.txt b/TypeTruth.egg-info/requires.txt new file mode 100644 index 0000000..abdd677 --- /dev/null +++ b/TypeTruth.egg-info/requires.txt @@ -0,0 +1,8 @@ +requests +numpy +pandas +matplotlib +seaborn +torch +scikit-learn +transformers diff --git a/TypeTruth.egg-info/top_level.txt b/TypeTruth.egg-info/top_level.txt new file mode 100644 index 0000000..bdc04d9 --- /dev/null +++ b/TypeTruth.egg-info/top_level.txt @@ -0,0 +1 @@ +TypeTruth diff --git a/TypeTruth/__init__.py b/TypeTruth/__init__.py index 42179ba..2799b39 100644 --- a/TypeTruth/__init__.py +++ b/TypeTruth/__init__.py @@ -1,3 +1,2 @@ -from .detector import AIDetector -from .plotting import plot_bar_chart, plot_stacked_bar_chart, plot_heatmap - \ No newline at end of file +from .aitextdetector import AIDetector +from .plotting import plot_bar_chart, plot_stacked_bar_chart, plot_heatmap diff --git a/TypeTruth/__pycache__/__init__.cpython-311.pyc b/TypeTruth/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..6b7ea7c Binary files /dev/null and b/TypeTruth/__pycache__/__init__.cpython-311.pyc differ diff --git a/__pycache__/ai_detector.cpython-311.pyc b/__pycache__/ai_detector.cpython-311.pyc new file mode 100644 index 0000000..c91cb4d Binary files /dev/null and b/__pycache__/ai_detector.cpython-311.pyc differ diff --git a/ai_detector.py b/ai_detector.py index 1314ac7..23b2fed 100644 --- a/ai_detector.py +++ b/ai_detector.py @@ -1 +1 @@ -from .ai_detector import AIDetector, ContentGeneratorChecker +from TypeTruth.ai_detector import AIDetector, ContentGeneratorChecker diff --git a/build/lib/TypeTruth/__init__.py b/build/lib/TypeTruth/__init__.py new file mode 100644 index 0000000..42179ba --- /dev/null +++ b/build/lib/TypeTruth/__init__.py @@ -0,0 +1,3 @@ +from .detector import AIDetector +from .plotting import plot_bar_chart, plot_stacked_bar_chart, plot_heatmap + \ No newline at end of file diff --git a/build/lib/TypeTruth/aitextdetector.py b/build/lib/TypeTruth/aitextdetector.py new file mode 100644 index 0000000..2ae29ff --- /dev/null +++ b/build/lib/TypeTruth/aitextdetector.py @@ -0,0 +1,113 @@ +import requests +import numpy as np +import pandas as pd +import re + +class ContentGeneratorChecker: + def __init__(self, token): + self.header = { + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8', + 'Authorization': token, + 'Connection': 'keep-alive', + 'Content-Type': 'application/json', + 'Origin': 'https://platform.openai.com', + 'Referer': 'https://platform.openai.com/', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-site', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + } + self.possible_classes = [ + 'Very unlikely to be AI-generated', + 'Unlikely to be AI-generated', + 'Cannot determine if AI-written or human-written', + 'Possibly AI-generated', + 'Likely AI-generated', + ] + self.class_max = [10, 45, 90, 98, 99] + + def detect(self, text, all_probs=False): + data = { + 'prompt': text + "ยป.\n", + 'max_tokens': 1, + 'temperature': 1, + 'top_p': 1, + 'n': 1, + 'logprobs': 5, + 'stop': '\n', + 'stream': False, + 'model': 'model-detect-v2', + } + try: + response = requests.post('https://api.openai.com/v1/completions', headers=self.header, json=data) + response.raise_for_status() + except requests.exceptions.HTTPError as err: + if response.status_code == 401: + return "Error: Invalid bearer token" + else: + return f"HTTP error occurred: {err}" + + if response.status_code == 200: + choices = response.json()['choices'][0] + logprobs = choices['logprobs']['top_logprobs'][0] + probs = {key: round(100 * np.e ** value, 2) for key, value in logprobs.items()} + key_prob = probs['"'] + if self.class_max[0] < key_prob < self.class_max[len(self.class_max) - 1]: + val = max(i for i in self.class_max if i < key_prob) + class_label = self.possible_classes[self.class_max.index(val)] + elif self.class_max[0] > key_prob: + class_label = self.possible_classes[0] + else: + class_label = self.possible_classes[len(self.possible_classes) - 1] + top_prob = {'Class': class_label, 'AI-Generated Probability': key_prob} + if all_probs: + return probs, top_prob + return top_prob + return "Check prompt, Length of sentence it should be more than 1,000 characters" + +class AIDetector: + def __init__(self, token_path='openai_bearer.txt'): + with open(token_path) as file: + self.bearer_token = file.readline().strip() + self.od = ContentGeneratorChecker(self.bearer_token) + + def human_or_ai(self, score): + if score >= 90: + human_prob = 100 - score + return score, human_prob, f"{score:.2f}% of the Text generated by AI." + elif score >= 70: + human_prob = 100 - score + return score, human_prob, f"{score:.2f}% of the Text generated by AI." + elif 60 > score >= 50: + human_prob = 100 - score + return score, human_prob, "AI text written by Human or Human written text improved by AI." + elif score >= 30: + human_prob = 100 - score + return score, human_prob, f"{human_prob:.2f}% Text written by human." + else: + human_prob = 100 - score + return score, human_prob, f"{human_prob:.2f}% Text written by human." + + def detect(self, text, split_type='sentence'): + if split_type == 'sentence': + chunks = text.split(". ") + elif split_type == 'paragraph': + chunks = re.split('\n+', text) + else: + return "Invalid split_type. Choose 'sentence' or 'paragraph'." + + chunk_list = [] + for chunk in chunks: + ai_response = self.od.detect(chunk) + if isinstance(ai_response, str): + return ai_response + else: + ai_score, human_score, human_or_ai_confidence = self.human_or_ai(ai_response['AI-Generated Probability']) + chunk_list.append([chunk, ai_score, human_score, human_or_ai_confidence]) + + df = pd.DataFrame(chunk_list, columns=['Chunk', 'AI Score', 'Human Score', 'Confidence']) + return df diff --git a/build/lib/TypeTruth/plotting.py b/build/lib/TypeTruth/plotting.py new file mode 100644 index 0000000..3b28a74 --- /dev/null +++ b/build/lib/TypeTruth/plotting.py @@ -0,0 +1,52 @@ +import seaborn as sns +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors + +def plot_bar_chart(df): + # Create a list of paragraph labels + para_labels = [f"Paragraph {i+1}" for i in range(len(df))] + + # Plotting + fig, ax = plt.subplots(figsize=(10, 3)) + bars = plt.bar(para_labels, df['AI Generated Probability'], color='skyblue') + ax.bar_label(bars) + + plt.xlabel('Paragraphs') + plt.ylabel('AI Generated Probability (%)') + plt.title('AI Generated Probability Across Paragraphs') + plt.xticks(rotation=45) + plt.tight_layout() + plt.show() + +def plot_stacked_bar_chart(df): + # Create a list of paragraph labels + para_labels = [f"Paragraph {i+1}" for i in range(len(df))] + + # Plotting + fig, ax = plt.subplots(figsize=(10, 3.5)) + + bar1 = plt.bar(para_labels, df['AI Generated Probability'], color='skyblue') + bar2 = plt.bar(para_labels, df['Human Written Probability'], bottom=df['AI Generated Probability'], color='salmon') + + ax.bar_label(bar1, label_type='center', color='black') + ax.bar_label(bar2, label_type='center', color='black') + + plt.xlabel('Paragraphs') + plt.ylabel('Probability (%)') + plt.title('AI Generated and Human Written Probability Across Paragraphs') + plt.xticks(rotation=45) + plt.legend([bar1, bar2], ['AI Generated', 'Human Written'], bbox_to_anchor=(1.05, 1)) + plt.tight_layout() + plt.show() + +def plot_heatmap(df): + # Replace full sentences with first two words followed by '...' + df['Content'] = df['Content'].apply(lambda x: ' '.join(x.split()[:2]) + '...') + + # Transpose the dataframe to form a matrix for heatmap + heatmap_data = df[['Content', 'AI Generated Probability', 'Human Written Probability']].set_index('Content').transpose() + + plt.figure(figsize=(15, 3)) + sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu") + plt.title('Heatmap of AI Generated and Human Written Probability') + plt.show() diff --git a/setup.py b/setup.py index 4aa5444..6b46f3c 100644 --- a/setup.py +++ b/setup.py @@ -1,29 +1,32 @@ -from setuptools import setup, find_packages - -setup( - name='TypeTruth', - version='0.1.0', - author='Bhaskar Tripathi', - author_email='bhaskar.tripathi@gmail.com', - long_description=open('README.md').read(), - description='TypeTruth is a Python library that detects whether a text is written by a human or AI. Ideal for fact-checking and content validation in the age of AI content generators. Detect content generated by chatGPT, GPT4, GPT3.5, GPT3, Falcon, Vicuna, Stanford Alpaca, and LlAMA', - long_description_content_type="text/markdown", - url='https://github.com/bhaskatripathi/TypeTruth', - packages=find_packages(), - install_requires=[ - 'requests', - 'numpy', - 'pandas', - 'matplotlib', - 'seaborn', - ], - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - ], - python_requires='>=3.7', -) +from setuptools import setup, find_packages + +setup( + name='TypeTruth', + version='0.1.0', + author='Bhaskar Tripathi', + author_email='bhaskar.tripathi@gmail.com', + long_description=open('README.md').read(), + description='TypeTruth is a Python library that detects whether a text is written by a human or AI. Ideal for fact-checking and content validation in the age of AI content generators. Detect content generated by chatGPT, GPT4, GPT3.5, GPT3, Falcon, Vicuna, Stanford Alpaca, and LlAMA', + long_description_content_type="text/markdown", + url='https://github.com/bhaskatripathi/TypeTruth', + packages=find_packages(), + install_requires=[ + 'requests', + 'numpy', + 'pandas', + 'matplotlib', + 'seaborn', + 'torch', + 'scikit-learn', + 'transformers', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + ], + python_requires='>=3.7', +)