import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time setup (uncomment on the first run): the tokenizer, tagger,
# stop-word list and WordNet data ship as separate NLTK resources.
# nltk.download("punkt")
# nltk.download("averaged_perceptron_tagger")
# nltk.download("stopwords")
# nltk.download("wordnet")


def preprocess_data(file):
    """Parse a date,title CSV file object into a list of dicts."""
    parsed_data = []
    for line in file:
        parts = line.split(",")
        parsed_data.append({
            "publication_date": parts[0],
            # Titles may themselves contain commas, so rejoin the remainder.
            "title": ",".join(parts[1:]),
        })
    return parsed_data


def is_autosemantic(tag):
    """Keep only content words: adjectives (JJ*), nouns (NN*) and verbs (VB*)."""
    return re.match(r"^(JJ|NN|VB)", tag) is not None


with open("./examiner-date-text.csv", "r") as data:
    processed_data = preprocess_data(data)

word_counts = {}
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Skip the header row and cap the run at the first 100,000 headlines.
for item in processed_data[1:100000]:
    words = word_tokenize(item["title"])
    tagged_words = nltk.pos_tag(words)
    for word, tag in tagged_words:
        lemma = lemmatizer.lemmatize(word.casefold())
        if lemma not in stop_words and is_autosemantic(tag):
            word_counts[lemma] = word_counts.get(lemma, 0) + 1

# Print the 100 most frequent content-word lemmas.
sorted_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
for lemma, count in sorted_counts[:100]:
    print(f"{lemma} freq: {count}")
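
# Possible refinement (a sketch, not part of the original script): by default,
# WordNetLemmatizer treats every token as a noun, so verbs such as "says" are
# left unlemmatized. One option is to map each Penn Treebank tag to a WordNet
# POS and pass it along; `to_wordnet_pos` below is a hypothetical helper name.
from nltk.corpus import wordnet


def to_wordnet_pos(tag):
    """Map a Penn Treebank tag prefix to the matching WordNet POS constant."""
    if tag.startswith("JJ"):
        return wordnet.ADJ
    if tag.startswith("VB"):
        return wordnet.VERB
    return wordnet.NOUN  # WordNetLemmatizer's own default


# Inside the counting loop above, this would read:
#     lemma = lemmatizer.lemmatize(word.casefold(), pos=to_wordnet_pos(tag))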