Labeller

In Progress

Solution

import numpy

from collections import Counter
from nltk import word_tokenize
from nltk.corpus.reader.wordnet import NOUN
from nltk.stem import WordNetLemmatizer

class Question:
    """Training-data store for the labeler.

    Keeps, for everything inserted so far:
      * the flat list of categories seen (with repeats),
      * a word -> categories lookup, and
      * a category -> co-occurring-category count table.
    """

    def __init__(self):
        # Counter of category frequencies; populated by build_top_topics().
        self.category_count = None
        # Every category passed to insert_words_and_categories, repeats kept.
        self.categories = []
        # word -> list of categories that word has been inserted with.
        self.map = {}
        # Not read or written by any method of this class.
        self.tf = {}
        # category -> Counter of the other categories it appeared alongside.
        self.context_matrix = {}

    def insert_words_and_categories(self, words, categories):
        """Record one training example.

        Args:
            words: iterable of already-cleaned words from the question.
            categories: iterable of category labels attached to the question.
        Returns:
            None
        """
        # Materialize once: `categories` is walked several times below, and a
        # one-shot iterator (e.g. Python 3 `map`) would be empty after the
        # first pass.
        categories = list(categories)
        self.categories.extend(categories)
        for word in words:
            if word in self.map:
                # Merge with what is already known and drop duplicates.
                self.map[word] = list(set(self.map[word] + categories))
            else:
                self.map[word] = list(categories)
        self.update_context_matrix(categories)

    def build_top_topics(self):
        """Builds the counter for top topics."""
        self.category_count = Counter(self.categories)

    def update_context_matrix(self, categories):
        """
        Update the co-occurrence matrix for all the categories.

        Each category in `categories` has its counter incremented once for
        every *other* position in the same list.

        Args:
            categories: iterable of category labels that appeared together.
        Returns:
            None
        """
        categories = list(categories)
        for i, category in enumerate(categories):
            # Everything before and after position i. The previous slice
            # `[i-1:0:-1]` never reached index 0 and, for i == 0, wrapped
            # around and counted the later entries a second time.
            neighbours = categories[:i] + categories[i + 1:]
            self.context_matrix.setdefault(category, Counter()).update(neighbours)


class Labeler:
    """Learns word -> category associations and labels new questions."""

    def __init__(self):
        # Tokens dropped before lemmatization. Matching is case-sensitive,
        # which is why the capitalized 'How' and 'In' are listed explicitly.
        self.stop_words = ['all', 'just', 'being', 'over', 'both', 'through',
                           'its', 'before', 'herself', 'had', 'should', 'them',
                           'only', 'won', 'under', 'ours', 'has', 'then', 'to',
                           'his', 'very', 'they', 'not', 'during', 'him',
                           'nor', 'd', 'did', 'these', 'she', 'each', 'now',
                           'further', 'where', 'because', 'doing', 'are',
                           'our', 'ourselves', 'out', 'what', 'for', 'below',
                           'does', 'above', 'some', 'few', 'between', 'be',
                           'we', 'after', 'here', 'by', 'on', 'about', 'of',
                           'against', 's', 'or', 'own', 'into', 'yourself',
                           'down', 'your', 'from', 'her', 'whom', 'there',
                           'been', 'their', 'too', 'themselves', 'was',
                           'until', 'more', 'himself', 'that', 'but', 'with',
                           'than', 'those', 'he', 'me', 'myself', 'this', 'up',
                           'will', 'while', 'can', 'were', 'my', 'and', 'do',
                           'is', 'am', 'it', 'an', 'as', 'itself', 'at',
                           'have', 'in', 'any', 'if', 'again', 'no', 'when',
                           'same', 'how', 'other', 'which', 'you', 'who',
                           'most', 'such', 'why', 'a', 'off', 'i', 'm', 'so',
                           'y', 'the', 'having', 'once', 'yours', "'s", '?',
                           'yo', 'yourselves', 'hers', 'theirs', 't', 'don',
                           'How', 'In']
        self.q = Question()
        self.wnl = WordNetLemmatizer()

    def _extract_words(self, sentence):
        """Tokenize `sentence`, drop stop words, lemmatize the rest as nouns."""
        # Build the set per call so edits to self.stop_words are honoured
        # while each token lookup stays O(1).
        stop_words = set(self.stop_words)
        return [self.wnl.lemmatize(token, NOUN)
                for token in word_tokenize(sentence)
                if token not in stop_words]

    def tokenize_and_build(self, sentence, categories):
        """
        Tokenize the sentence and build the resultant data structure.

        Args:
            sentence: string representing the sentence to enter
            categories: iterable of category labels attached to the sentence
        Returns:
            None
        Raises:
            None
        """
        # Materialize so a one-shot iterator survives being walked more than
        # once downstream.
        categories = list(categories)
        words = self._extract_words(sentence)
        # insert_words_and_categories already updates the co-occurrence
        # matrix; calling update_context_matrix again here doubled every count.
        self.q.insert_words_and_categories(words, categories)

    def query_new_questions(self, sentence):
        """
        Suggest categories for an unseen sentence.

        Every known word in the sentence votes for each category it was
        trained with; up to ten categories with the most votes are returned.

        Args:
            sentence: string representing the question to label
        Returns:
            Space-separated string of category labels, most-voted first.
            Empty string when no word of the sentence is known.
        """
        votes = []
        for word in self._extract_words(sentence):
            votes.extend(self.q.map.get(word, ()))
        top = [category for category, _ in Counter(votes).most_common(10)]
        return " ".join(str(category) for category in top)


def main():
    """Train on `labeler_sample.in` and write predictions to `my_answers.txt`.

    Input layout as read by this function: the first line holds two integers
    `n m`; the next `2 * n` lines are `n` training pairs (a line of
    space-separated integer categories followed by the question text); the
    following `m` lines are the questions to label. One line of predicted
    categories is written per query.
    """
    labeler = Labeler()
    with open("labeler_sample.in", "r") as f:
        content = f.readlines()
    n, m = map(int, content[0].split())

    # Training pairs start at line 1 and are two lines each. The previous
    # step of 2*n+m made the loop stop after the first pair.
    for j in range(1, 2 * n, 2):
        categories = [int(token) for token in content[j].split()]
        sentence = content[j + 1].strip()
        labeler.tokenize_and_build(sentence, categories)
    labeler.q.build_top_topics()

    # Queries follow the last training sentence (index 2*n), so they occupy
    # indices 2*n+1 .. 2*n+m.
    first_query = 2 * n + 1
    results = [labeler.query_new_questions(line.strip())
               for line in content[first_query:first_query + m]]

    # Text mode: the payload is a str, which a binary-mode file rejects on
    # Python 3.
    with open("my_answers.txt", "w") as f:
        f.write("\n".join(results))

# Run the train-and-label pipeline only when executed as a script, not when
# this module is imported.
if __name__ == '__main__':
    main()

results matching ""

    No results matching ""