tip2sentence.py
# -*- coding: utf-8 -*-
"""
Created on Fri May 8 23:49:32 2015
@author: robert
"""
from pymongo import MongoClient
import re
from nltk.corpus import stopwords
# The punkt tokenizer handles sentence splitting; download it once with
# nltk.download('punkt') if the data is not already installed.
import nltk.data
# Load the pre-trained punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
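# Illustrative check of the tokenizer (not in the original script): punkt
# splits raw text into sentence strings, e.g.
#   tokenizer.tokenize("Good pizza. Bad service.")
#   -> ['Good pizza.', 'Bad service.']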
def connect_mongo_client(server='localhost', port=27017):
    # Return a MongoClient connected to the given server and port.
    client = MongoClient(server, port)
    return client
def load_reviews(indb, outdb, tokenizer, skip=0, limit=0):
    # Read raw tips from the input collection, split each into sentences
    # (lists of words), and bulk-insert one document per sentence into the
    # output collection. Returns the bulk-write result dict.
    bulk = outdb.initialize_unordered_bulk_op()
    for rev in indb.find({}, {'_id': 0, 'text': 1}).skip(skip).limit(limit):
        sentences = review_to_sentences(rev['text'], tokenizer)
        for s in sentences:
            bulk.insert({'text': s})
    return bulk.execute()
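# Illustrative call (collection names as used below); bulk.execute()
# returns a result dict with counts, roughly:
#   load_reviews(client.yelp.tip, client.yelp.sentence, tokenizer, limit=2)
#   -> {'nInserted': 3, ...}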
def review_to_words(review, remove_stopwords=False):
    # Convert a document to a sequence of words, optionally removing stop
    # words. Returns a list of words.
    # 1. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review)
    # 2. Convert words to lower case and split them
    words = review_text.lower().split()
    # 3. Optionally remove stop words (off by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # 4. Return a list of words
    return words
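# Illustrative examples ('the' and 'was' are NLTK English stopwords):
#   review_to_words("The food was great!")
#   -> ['the', 'food', 'was', 'great']
#   review_to_words("The food was great!", remove_stopwords=True)
#   -> ['food', 'great']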
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Split a review into parsed sentences. Returns a list of sentences,
    # where each sentence is a list of words.
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    # 2. Loop over each sentence, skipping empty ones
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            # Call review_to_words to get a list of words
            sentences.append(review_to_words(raw_sentence, remove_stopwords))
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists).
    return sentences
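# Illustrative example (stopwords kept by default):
#   review_to_sentences("The food was great. I will come back!", tokenizer)
#   -> [['the', 'food', 'was', 'great'], ['i', 'will', 'come', 'back']]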
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
client = connect_mongo_client()
#client.yelp.create_collection("sentence")
batchsize = 10000
totalsize = client.yelp.tip.count()
nbatches = totalsize // batchsize   # number of full batches
finalsize = totalsize % batchsize   # size of the last, partial batch
print "Loading %d tips in %d batches of %d plus a final batch of %d..." % (totalsize, nbatches, batchsize, finalsize)
count = 1
total = 0
# Process the full batches; the partial final batch is handled separately
# below so that no documents are read twice.
for n in range(0, nbatches * batchsize, batchsize):
    print "%.2f%% (%d/%d) Read %d tips and tokenized ..." % (100.0 * n / totalsize, count, nbatches, batchsize),
    count += 1
    result = load_reviews(client.yelp.tip, client.yelp.sentence, tokenizer, skip=n, limit=batchsize)
    print "%d sentences inserted..." % result['nInserted']
    total += result['nInserted']
if finalsize > 0:
    print "Read final %d tips and tokenized ..." % finalsize,
    result = load_reviews(client.yelp.tip, client.yelp.sentence, tokenizer, skip=totalsize - finalsize, limit=finalsize)
    print "%d sentences inserted..." % result['nInserted']
    total += result['nInserted']
print "Done. %d total sentences." % total
#raw_input("Press Enter to continue...")
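# If the commented-out Word2Vec block below is ever enabled, `sentences`
# must first hold the full tokenized corpus. A minimal sketch, assuming the
# schema written by load_reviews above:
#sentences = [doc['text'] for doc in
#             client.yelp.sentence.find({}, {'_id': 0, 'text': 1})]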
"""
# Set values for various parameters
num_features = 300 # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)
# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "yelpreviewtest"
model.save(model_name)
"""