from collections import Counter

import matplotlib.pyplot as plt

# Disabled matplotlib experiments, kept for reference. The bare string below
# is never executed; it only serves as a block comment.
'''
x1 = [1, 2, 3]
y1 = [10, 5, 7]

y2 = [4,8,3,7,5 ]
x2= [1, 2, 3, 4, 5]

x3 = [2, 4, 5, 6, 7]
y3 = [1,5,6,9, 8]

plt.title("First diagram")
plt.xlabel("X axis")
plt.ylabel("Y axis")
#plt.xlim(1,8)
plt.ylim(1, 10)
#plt.plot(x1, y1, label="Line 1", color="red", linestyle="dashed", linewidth=3, marker="o", markerfacecolor="blue", markersize=20)
#plt.plot(x2, y2, label="Line 2")
#plt.scatter(x3, y3, label="Random points")
#plt.scatter(x1, y1, label="Random points 2")
labels = ["Apples", "Bannanas", "Oranges"]
#plt.pie(y1, labels=labels, explode = (0.1,0.2,0.4))
#plt.legend()
ages = [20, 35, 35, 98, 31, 23, 23, 76, 45, 12, 10, 30, 24, 57, 90, 78, 51, 34, 47]
plt.hist(ages, 10)
plt.show()
'''


def preprocess_data(file):
    """Parse ``date,title`` lines into a list of record dicts.

    Each non-blank line is split at its FIRST comma only, so commas inside
    the title are preserved. No CSV quoting rules are applied: quote
    characters, if any, stay in the title verbatim.

    Args:
        file: An iterable of text lines (for example an open text file).

    Returns:
        A list of dicts with the keys ``"publication_date"`` and ``"title"``,
        one per non-blank input line, in input order. A header line, if the
        input has one, is returned like any other record; callers skip it.
        A line without a comma yields the whole line as the date and an
        empty title.
    """
    parsed_data = []
    for raw_line in file:
        # Drop the line terminator so it does not leak into the title (or
        # into the date when the line has no comma).
        line = raw_line.rstrip("\r\n")
        if not line:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise be counted as a bogus publication date.
            continue
        publication_date, _, title = line.partition(",")
        parsed_data.append(
            {"publication_date": publication_date, "title": title}
        )
    return parsed_data


processed_data = []
with open("./examiner-date-text.csv", "r") as data:
    processed_data = preprocess_data(data)

# Number of records per publication date; the first record is skipped because
# it is treated as the header row.
date_counts = Counter(item["publication_date"] for item in processed_data[1:])

# Dates are ordered as plain strings. That is chronological only if the date
# format sorts lexicographically (e.g. YYYYMMDD) -- TODO confirm against the
# data file.
sorted_date_counts = sorted(date_counts.items(), key=lambda item: item[0])
frequencies = [count for _, count in sorted_date_counts]

print(frequencies)

# NOTE(review): hist() bins the per-date article counts into 20 bins, so the
# x axis shows "articles per date" and the y axis "number of dates". The
# labels below read as if a time series was intended -- confirm which plot is
# wanted before changing either the call or the labels.
plt.hist(frequencies, 20)
plt.xlabel("time")
plt.ylabel("number of articles")
plt.show()

#day_count = len(date_counts.keys())
#articles_count = len(processed_data)
#average_article_count = articles_count / day_count

# Disabled per-date report and word-frequency experiment, kept for reference.
'''for date in date_counts.keys():
    print(date + ": " + str(date_counts[date]) + " articles")

print(average_article_count)

word_counts = {}
for item in processed_data[1:]:
    words = item["title"].split(" ")
    for word in words:
        if word.lower() in word_counts.keys():
            word_counts[word.lower()] += 1
        else:
            word_counts[word.lower()] = 1

sorted_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)

for i in range(100):
    print(str(sorted_counts[i][0]) + " freq: " + str(sorted_counts[i][1]))'''