# webcrawlanalyse.py
# Merge, analyze, and visualize JSON records produced by a multi-threaded
# web crawler.  Each record is a dict with (at least) the keys
# "url", "depth", "linksExtracted" and "timeTakenMs".
import os
import json
from collections import Counter
from urllib.parse import urlparse


def load_json_file(filepath):
    """Load and parse a JSON file.

    Returns the parsed data, or [] when the file is missing or does not
    contain valid JSON (an error message is printed in both cases, so
    callers can treat the result as a best-effort list).
    """
    try:
        with open(filepath, "r") as file:
            return json.load(file)
    except FileNotFoundError:
        # The original let this propagate and crash main(); the per-thread
        # dumps may legitimately be absent, so report and continue.
        print(f"Error opening the file {filepath}.")
        return []
    except json.JSONDecodeError:
        print(f"Error parsing the file {filepath}.")
        return []


def merge_json_files(file1, file2, output_file):
    """Concatenate the JSON arrays in *file1* and *file2* into *output_file*.

    Missing or unparseable inputs contribute nothing (load_json_file already
    reports the problem), so a partial merge is still written.
    """
    all_data = []
    for path in (file1, file2):
        # load_json_file returns [] on any error, so extend() is always safe.
        all_data.extend(load_json_file(path))

    with open(output_file, "w") as output:
        json.dump(all_data, output, indent=4)

    print(f"All data has been merged into {output_file}.")


def analyze_data(data):
    """Compute summary statistics for a list of crawled-page records.

    Returns a dict with keys: total_pages, total_links, depth_distribution
    (Counter of crawl depth -> page count), top_domains (10 most common
    (domain, count) pairs), time_taken_seconds, num_sites_crawled, and
    domain_times (total crawl time in ms per domain).
    """
    total_pages = len(data)
    total_links = sum(page.get("linksExtracted", 0) for page in data)
    total_time_ms = sum(page.get("timeTakenMs", 0) for page in data)

    depth_distribution = Counter(page.get("depth", 0) for page in data)

    # netloc of every crawled URL; duplicates kept so the counts below work.
    domains = [urlparse(page.get("url", "")).netloc for page in data]
    domain_counts = Counter(domains).most_common(10)

    time_taken_seconds = total_time_ms / 1000.0
    num_sites_crawled = len(set(domains))

    # Total time spent per domain.  Counter is a dict subclass, so callers
    # that expect a plain mapping still work.
    domain_times = Counter()
    for page in data:
        domain_times[urlparse(page.get("url", "")).netloc] += page.get("timeTakenMs", 0)

    return {
        "total_pages": total_pages,
        "total_links": total_links,
        "depth_distribution": depth_distribution,
        "top_domains": domain_counts,
        "time_taken_seconds": time_taken_seconds,
        "num_sites_crawled": num_sites_crawled,
        "domain_times": domain_times,
    }


def visualize_data(analysis_results):
    """Render the analysis charts as PNGs under analysis_images/.

    Each chart is saved with savefig() and also shown interactively
    with plt.show().
    """
    # Imported lazily so the merge/analyze half of the module works even
    # when matplotlib is not installed.
    import matplotlib.pyplot as plt

    # Create a main folder for storing images if it doesn't exist.
    output_dir = "analysis_images"
    os.makedirs(output_dir, exist_ok=True)

    # Bar Chart: Depth Distribution
    depths, counts = zip(*analysis_results["depth_distribution"].items())
    plt.figure(figsize=(10, 6))
    plt.bar(depths, counts, color='skyblue')
    plt.xlabel("Depth Level")
    plt.ylabel("Number of Pages")
    plt.title("Depth Distribution of Crawled Pages")
    plt.xticks(depths)
    plt.savefig(os.path.join(output_dir, "depth_distribution.png"))
    plt.show()

    # Pie Chart: Top Domains
    labels, counts = zip(*analysis_results["top_domains"])
    plt.figure(figsize=(8, 8))
    plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
    plt.title("Top 10 Domains Crawled")
    plt.savefig(os.path.join(output_dir, "top_domains.png"))
    plt.show()

    # Pie Chart: per-domain share.
    # NOTE(review): this slices by total time per domain (domain_times), not
    # page counts, despite the title -- confirm which was intended.
    unique_sites = analysis_results["domain_times"]
    plt.figure(figsize=(8, 8))
    plt.pie(list(unique_sites.values()), labels=list(unique_sites.keys()),
            autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
    plt.title("Proportion of Pages Crawled by Domain")
    plt.savefig(os.path.join(output_dir, "unique_sites.png"))
    plt.show()

    # Bar Chart: Total Time Taken per Domain
    domains, times = zip(*analysis_results["domain_times"].items())
    plt.figure(figsize=(12, 8))
    plt.bar(domains, times, color='lightgreen')
    plt.xlabel("Domain")
    plt.ylabel("Total Time Taken (ms)")
    plt.title("Total Time Taken per Domain")
    plt.xticks(rotation=90)
    plt.savefig(os.path.join(output_dir, "time_per_domain.png"))
    plt.show()

    # Notify user where the files are saved
print(f"Visualizations saved in '{output_dir}'.") + + +def main(): + """ + Main function to load, merge, analyze, and visualize crawled data. + """ + # Path to the folder containing thread JSON files + file1 = "crawl_data_1.json" + file2 = "crawl_data_2.json" + output_file = "crawl_data.json" # Path to the merged JSON file + + # Merge JSON files from multiple threads into one + print("Merging JSON files...") + merge_json_files(file1, file2, output_file) + + # Load Merged Data + print("Loading data from merged JSON file...") + data = load_json_file(output_file) + if not data: + print("No data found in the file. Exiting...") + return + print(f"Loaded {len(data)} pages from {output_file}.") + + # Analyze Data + print("Analyzing data...") + analysis_results = analyze_data(data) + print(f"Total Pages Crawled: {analysis_results['total_pages']}") + print(f"Total Links Found: {analysis_results['total_links']}") + print(f"Time Taken for Crawling: {analysis_results['time_taken_seconds']:.2f} seconds") + print(f"Number of Unique Sites Crawled: {analysis_results['num_sites_crawled']}") + print(f"Depth Distribution: {dict(analysis_results['depth_distribution'])}") + print(f"Top 10 Domains: {analysis_results['top_domains']}") + + # Step 4: Visualize Data + print("Visualizing data...") + visualize_data(analysis_results) + + +if __name__ == "__main__": + main() diff --git a/webcrawlanalyser.c b/webcrawlanalyser.c new file mode 100644 index 0000000..0f76110 --- /dev/null +++ b/webcrawlanalyser.c @@ -0,0 +1,219 @@ +#include +#include +#include +#include "cJSON.h" + +void load_and_merge_json(const char *files[], int num_files, const char *output_file) { + cJSON *merged_data = cJSON_CreateArray(); + + for (int i = 0; i < num_files; i++) { + FILE *file = fopen(files[i], "r"); + if (!file) { + printf("Error opening file: %s\n", files[i]); + continue; + } + + fseek(file, 0, SEEK_END); + long file_size = ftell(file); + fseek(file, 0, SEEK_SET); + + char *file_content = malloc(file_size + 
1); + fread(file_content, 1, file_size, file); + fclose(file); + + cJSON *json_data = cJSON_Parse(file_content); + free(file_content); + + if (json_data) { + for (int j = 0; j < cJSON_GetArraySize(json_data); j++) { + cJSON *item = cJSON_GetArrayItem(json_data, j); + cJSON_AddItemToArray(merged_data, cJSON_Duplicate(item, 1)); + } + cJSON_Delete(json_data); + } + } + + FILE *output = fopen(output_file, "w"); + if (output) { + char *merged_json = cJSON_Print(merged_data); + fprintf(output, "%s", merged_json); + free(merged_json); + fclose(output); + } + + cJSON_Delete(merged_data); +} + +char* extract_domain_from_url(const char* url) { + const char *http_prefix = "http://"; + const char *https_prefix = "https://"; + const char *domain_start = NULL; + + if (strncmp(url, http_prefix, strlen(http_prefix)) == 0) { + domain_start = url + strlen(http_prefix); + } else if (strncmp(url, https_prefix, strlen(https_prefix)) == 0) { + domain_start = url + strlen(https_prefix); + } else { + domain_start = url; + } + + const char *domain_end = strchr(domain_start, '/'); + + if (domain_end == NULL) { + domain_end = domain_start + strlen(domain_start); + } + + size_t domain_len = domain_end - domain_start; + char *domain = malloc(domain_len + 1); + if (domain) { + strncpy(domain, domain_start, domain_len); + domain[domain_len] = '\0'; // Null-terminate the string + } + + return domain; +} + +void analyze_data(cJSON *data, int *depths, int *times, int *site_counts, FILE *csv_file) { + int total_pages = 0; + int total_links = 0; + int total_time_ms = 0; + + cJSON *domains = cJSON_CreateObject(); + + cJSON *item = NULL; + cJSON_ArrayForEach(item, data) { + total_pages++; + total_links += cJSON_GetObjectItem(item, "linksExtracted")->valueint; + total_time_ms += cJSON_GetObjectItem(item, "timeTakenMs")->valueint; + + int depth = cJSON_GetObjectItem(item, "depth")->valueint; + depths[depth]++; + + int time_taken = cJSON_GetObjectItem(item, "timeTakenMs")->valueint; + times[depth] += 
time_taken; + + site_counts[depth]++; + + char *url = cJSON_GetObjectItem(item, "url")->valuestring; + char *domain = extract_domain_from_url(url); + cJSON *domain_count = cJSON_GetObjectItem(domains, domain); + if (domain_count) { + domain_count->valueint++; + } else { + cJSON_AddNumberToObject(domains, domain, 1); + } + } + + // Output data to CSV + fprintf(csv_file, "Total Pages Crawled, Total Links Found, Total Time Taken (ms)\n"); + fprintf(csv_file, "%d, %d, %d\n", total_pages, total_links, total_time_ms); + + fprintf(csv_file, "Depth Distribution:\n"); + for (int i = 0; i < 100; i++) { + if (depths[i] > 0) { + fprintf(csv_file, "Depth %d, %d pages\n", i, depths[i]); + } + } + + fprintf(csv_file, "Domain Counts:\n"); + cJSON *domain_item = NULL; + cJSON_ArrayForEach(domain_item, domains) { + fprintf(csv_file, "%s, %d\n", domain_item->string, domain_item->valueint); + } + + // printf("Total Pages Crawled: %d\n", total_pages); + // printf("Total Links Found: %d\n", total_links); + // printf("Total Time Taken: %d ms\n", total_time_ms); + + // printf("Depth Distribution:\n"); + // for (int i = 0; i < 100; i++) { + // if (depths[i] > 0) { + // printf("Depth %d: %d pages\n", i, depths[i]); + // } + // } + + // printf("Domain Counts:\n"); + // cJSON_ArrayForEach(domain_item, domains) { + // printf("%s: %d\n", domain_item->string, domain_item->valueint); + // } + + cJSON_Delete(domains); +} + +void generate_depth_distribution_plot(int depths[]) { + FILE *data_file = fopen("plots/depth_distribution.dat", "w"); + for (int i = 0; i < 100; i++) { + if (depths[i] > 0) { + fprintf(data_file, "%d %d\n", i, depths[i]); + } + } + fclose(data_file); + + system("gnuplot -e \"set terminal png; set output 'plots/depth_distribution.png'; plot 'plots/depth_distribution.dat' using 1:2 with boxes\""); +} + +void generate_time_per_thread_plot(int times[], int num_threads) { + FILE *data_file = fopen("plots/time_per_thread.dat", "w"); + for (int i = 0; i < num_threads; i++) { + 
fprintf(data_file, "%d %d\n", i, times[i]); + } + fclose(data_file); + + system("gnuplot -e \"set terminal png; set output 'plots/time_per_thread.png'; plot 'plots/time_per_thread.dat' using 1:2 with linespoints\""); +} + +void generate_sites_per_thread_plot(int site_counts[], int num_threads) { + FILE *data_file = fopen("plots/sites_per_thread.dat", "w"); + for (int i = 0; i < num_threads; i++) { + fprintf(data_file, "%d %d\n", i, site_counts[i]); + } + fclose(data_file); + + system("gnuplot -e \"set terminal png; set output 'plots/sites_per_thread.png'; plot 'plots/sites_per_thread.dat' using 1:2 with linespoints\""); +} + +int main() { + // Create a folder for storing the plots + system("mkdir -p plots"); + + // Input and output files + const char *files[] = {"crawl_data_1.json", "crawl_data_2.json"}; + int num_files = 2; + load_and_merge_json(files, num_files, "merged_data.json"); + + // Read merged data + FILE *merged_file = fopen("merged_data.json", "r"); + fseek(merged_file, 0, SEEK_END); + long file_size = ftell(merged_file); + fseek(merged_file, 0, SEEK_SET); + + char *file_content = malloc(file_size + 1); + fread(file_content, 1, file_size, merged_file); + fclose(merged_file); + + cJSON *data = cJSON_Parse(file_content); + free(file_content); + + if (data) { + int depths[100] = {0}; + int times[100] = {0}; + int site_counts[100] = {0}; + + // Open CSV file for output + FILE *csv_file = fopen("analysis_output.csv", "w"); + if (csv_file) { + analyze_data(data, depths, times, site_counts, csv_file); + fclose(csv_file); + } + + // Generate plots + generate_depth_distribution_plot(depths); + generate_time_per_thread_plot(times, 100); // assuming 100 threads + generate_sites_per_thread_plot(site_counts, 100); // assuming 100 threads + + cJSON_Delete(data); + } + printf("Your data has been saved in 'analysis_output.csv' and graphs are in the 'plot' folder.\n"); + + return 0; +}