# parser.py


'''

Sentence Parser

Uses a context-free grammar formalism to parse English sentences and
determine their structure, helping a computer better understand the
meaning of a sentence. The resulting parses can be used for purposes
such as sentiment analysis, review analysis, etc.


'''

# Importing Libraries

import nltk
import sys
import re

# nltk.download('punkt')


# Defining terminals involved in the Context-Free Grammar

# Each rule maps a word-category symbol (Adj, Adv, Conj, Det, N, P, V) to the
# quoted literal words it may produce, with alternatives separated by "|".
# The N and V categories are spread over several rule lines.
# Every word is lowercase, matching the lowercasing done in preprocess().
TERMINALS = """
Adj -> "country" | "dreadful" | "enigmatical" | "little" | "moist" | "red"
Adv -> "down" | "here" | "never"
Conj -> "and"
Det -> "a" | "an" | "his" | "my" | "the"
N -> "armchair" | "companion" | "day" | "door" | "hand" | "he" | "himself"
N -> "holmes" | "home" | "i" | "mess" | "paint" | "palm" | "pipe" | "she"
N -> "smile" | "thursday" | "walk" | "we" | "word"
P -> "at" | "before" | "in" | "of" | "on" | "to" | "until"
V -> "arrived" | "came" | "chuckled" | "had" | "lit" | "said" | "sat"
V -> "smiled" | "tell" | "were"
"""

# Defining non-terminals involved in the Context-Free Grammar

# Phrase-structure rules built on top of the word categories in TERMINALS.
# By the usual convention: S = sentence, AP = adjective phrase,
# NP = noun phrase, PP = prepositional phrase, VP = verb phrase.
# Several rules are recursive (e.g. "NP -> NP PP", "S -> S Conj S"), so one
# sentence may have more than one parse tree; main() prints all of them.
NONTERMINALS = """
S -> NP VP | S Conj S | NP VP Conj VP
AP -> Adj | Adj AP
NP -> N | Det NP | AP NP | NP PP
PP -> P NP | P S
VP -> V | V NP | V NP PP | V PP | VP Adv | Adv VP
"""

# Forming the grammar

# Both rule sets are concatenated into a single grammar. NONTERMINALS comes
# first, so the S rule is the first production in the combined text.
# NOTE(review): nltk presumably uses the left-hand side of the first rule
# as the start symbol -- confirm before reordering the two strings.
grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
# Module-level chart parser built once from the grammar and used by main().
parser = nltk.ChartParser(grammar)


def main():

    """
    Read a sentence, parse it with the module-level `parser`, and print
    every resulting parse tree followed by its noun phrase chunks.

    The sentence is read from the file named by the command-line
    argument when exactly one argument is given; in every other case
    the user is prompted for it.
    """

    # Exactly one argument means "read the sentence from this file";
    # any other argument count falls back to the interactive prompt

    if len(sys.argv) != 2:
        text = input("Sentence: ")
    else:
        with open(sys.argv[1]) as handle:
            text = handle.read()

    # Turn the raw text into the list of words handed to the parser

    words = preprocess(text)

    # A ValueError raised while parsing is reported to the user instead
    # of propagating (presumably raised for words the grammar lacks)

    try:
        parses = list(parser.parse(words))
    except ValueError as error:
        print(error)
        return

    # No tree at all means the grammar cannot derive this sentence

    if not parses:
        print("Could not parse sentence.")
        return

    # Show every parse, each followed by its noun phrase chunks

    for parse in parses:
        parse.pretty_print()
        print("Noun Phrase Chunks")
        for chunk in np_chunk(parse):
            print(" ".join(chunk.flatten()))


def preprocess(sentence):

    """
    Convert `sentence` to a list of its words.

    The sentence is lowercased and split with `nltk.word_tokenize`;
    any token that does not contain at least one alphabetic character
    (a-z after lowercasing) is removed.

    Parameters:
        sentence: the raw sentence text as a string.

    Returns:
        A list of lowercase token strings, in their original order.

    """

    tokens = nltk.word_tokenize(sentence.lower())

    # re.search scans the whole token. The previous re.match only tested
    # the first character, so a token that contains letters but begins
    # with a digit or an apostrophe was wrongly discarded.
    return [token for token in tokens if re.search(r'[a-z]', token)]


def np_chunk(tree):

    """
    Collect the noun phrase chunks of a sentence tree.

    Every subtree of `tree` is offered to `is_np_chunk`; the ones it
    accepts (label "NP" with no other "NP" nested inside) are returned
    as a list, in the order `tree.subtrees` yields them.

    """

    return list(tree.subtrees(is_np_chunk))


def is_np_chunk(tree):

    """

    Tell whether `tree` is a noun phrase chunk.

    A chunk is a subtree labelled "NP" that has no other "NP"-labelled
    subtree anywhere beneath it. Returns True for a chunk and False
    otherwise.

    """

    # Anything not labelled NP can never be a chunk
    if tree.label() != 'NP':
        return False

    # subtrees() also yields `tree` itself, so it is skipped explicitly;
    # any other NP found below disqualifies this one
    return not any(
        inner.label() == 'NP' and inner != tree
        for inner in tree.subtrees()
    )


# Run the command-line interface only when this file is executed as a
# script; importing the module does not call main().
if __name__ == "__main__":
    main()