"""Article statistics for the Examiner headline dataset.

Reads ``./examiner-date-text.csv`` (expects a header row, then
``publication_date,title`` lines), prints the article count per publication
date, the average number of articles per day, and the 100 most frequent
words across all titles.
"""
from collections import Counter


def preprocess_data(file):
    """Parse ``date,title`` lines from *file* into a list of dicts.

    Only the first comma separates the date from the title; any further
    commas are kept as part of the title.  Each returned dict has the keys
    ``"publication_date"`` and ``"title"``.

    Bug fix: the trailing newline is now stripped from each line, so it no
    longer pollutes the last word of every title.
    """
    parsed_data = []
    for line in file:
        # partition splits on the FIRST comma only, matching the original
        # split-then-rejoin behavior for titles that contain commas.
        date, _, title = line.rstrip("\n").partition(",")
        parsed_data.append({"publication_date": date, "title": title})
    return parsed_data


def main():
    """Read the CSV, then print date counts, the average, and top words."""
    with open("./examiner-date-text.csv", "r") as data:
        processed_data = preprocess_data(data)

    # Skip the header row once, up front.
    articles = processed_data[1:]

    # Articles per publication date (Counter preserves first-seen order,
    # matching the original dict-based loop's print order).
    date_counts = Counter(item["publication_date"] for item in articles)

    day_count = len(date_counts)
    # Bug fix: count only data rows — the header row used to be included,
    # inflating the average by 1/day_count.
    articles_count = len(articles)
    # Guard against an empty or header-only file (no days -> no average).
    average_article_count = articles_count / day_count if day_count else 0.0

    for date, count in date_counts.items():
        print(date + ": " + str(count) + " articles")
    print(average_article_count)

    # Case-insensitive word frequencies across all titles; .lower() is
    # computed once per word instead of up to three times.
    word_counts = Counter(
        word.lower() for item in articles for word in item["title"].split(" ")
    )

    # Bug fix: most_common(100) yields at most 100 entries instead of
    # raising IndexError when fewer than 100 distinct words exist.
    for word, freq in word_counts.most_common(100):
        print(str(word) + " freq: " + str(freq))


if __name__ == "__main__":
    main()