# Load Packages
import pandas as pd
import numpy as np
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
import warnings
# !pip install pandas_profiling
from pandas_profiling import ProfileReport
warnings.filterwarnings("ignore")
# Display options
#pd.set_option('display.max_colwidth', -1)
# Function removes punctuation and non-letter characters from text
def clean_text(t):
    """Lowercase `t` and strip handles, hashtags, links, and punctuation.

    Every remaining non-letter character is replaced with a space, so the
    output contains only lowercase letters and spaces.
    """
    import re
    t = t.lower()
    t = re.sub(r"@[A-Za-z0-9]+", "", t)  # remove @handles entirely
    t = re.sub(r"#[A-Za-z0-9]+", "", t)  # remove #hashtags entirely
    t = re.sub(r"http\S+", "", t)        # remove links (anything up to the next space after http)
    t = re.sub(r"www\.\S+", "", t)       # remove links; dot escaped so ordinary words starting with 'www' survive
    t = re.sub(r"[()!?]", "", t)         # remove this punctuation outright (no space left behind)
    t = re.sub(r"'", "", t)              # remove apostrophes so "it's" -> "its", not "it s"
    t = re.sub(r"\[.*?\]", "", t)        # remove bracketed spans; raw string avoids the invalid-escape warning
    t = re.sub(r"[^a-z]", " ", t)        # replace any other non-letter with a space
    return t
# Function vectorizes specified text and returns most frequent ngrams in specified range
def get_ngrams(text, ngram_from = 1, ngram_to = 4, n = None, max_features = 20000):
    """Return the `n` most frequent n-grams in `text` (all of them when n is None).

    English stop words are dropped, and n-gram lengths between `ngram_from`
    and `ngram_to` (inclusive) are counted across the whole corpus. Result is
    a list of (term, frequency) pairs sorted by descending frequency.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram_from, ngram_to),
                                 max_features=max_features,
                                 stop_words="english")
    counts = vectorizer.fit_transform(text)
    # Column sums give the corpus-wide frequency of each vocabulary term
    totals = counts.sum(axis=0)
    term_freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    term_freqs.sort(key=lambda pair: pair[1], reverse=True)
    return term_freqs[:n]
# Isolate nouns in a corpus
def nouns(text):
    """Tokenize `text` and return only the nouns, joined back into one string.

    A token counts as a noun when its Penn Treebank POS tag starts with 'NN'
    (NN, NNS, NNP, NNPS).
    """
    # Local import keeps the nltk dependency scoped to this helper.
    # (The original also imported nltk, stopwords, and string here; all unused.)
    from nltk import word_tokenize, pos_tag
    is_noun = lambda tag: tag[:2] == 'NN'
    tagged = pos_tag(word_tokenize(text))
    return ' '.join(word for word, tag in tagged if is_noun(tag))
# Topic modeling function - applies LDA algorithm to text with number of topics specified, returns interactive visualization
def model_topics(df, topics, stop_words = None):
    """Fit an LDA topic model to a text series and return an interactive pyLDAvis view.

    Parameters
    ----------
    df : iterable of str
        Cleaned review texts to model.
    topics : int
        Number of LDA topics to fit.
    stop_words : list of str, optional
        Stop words for the vectorizer. When None, falls back to the
        module-level `sw` list, preserving the original behavior of
        reading the global.
    """
    if stop_words is None:
        stop_words = sw  # module-level list: NLTK English stop words plus 'hotel'
    # Vectorize to create the document-term matrix; ignore terms that appear
    # in fewer than 20% or more than 80% of documents
    v = CountVectorizer(stop_words = stop_words,
                        min_df = 0.2,
                        max_df = 0.8)
    dtm = v.fit_transform(df)
    # Fixed random_state makes the topic assignment reproducible
    lda = LatentDirichletAllocation(n_components = topics,
                                    random_state = 987)
    lda.fit_transform(dtm)
    lda_viz = pyLDAvis.sklearn.prepare(lda_model = lda,
                                       dtm = dtm,
                                       vectorizer = v)
    return pyLDAvis.display(lda_viz)
# Open Connection to Data Source
# Open Connection to Data Source
# (IPython magics: load the sql extension and point it at the local SQLite file)
%reload_ext sql
%sql sqlite:///yelp_hotel.db
# Define a plain sqlite3 connection to the same db for pandas queries
con = sqlite3.connect("yelp_hotel.db")
# Create pandas data frames from the three tables in the db
reviews = pd.read_sql_query("SELECT * from review;", con)
hotel = pd.read_sql_query("SELECT * from hotel;", con)
reviewer = pd.read_sql_query("SELECT * from reviewer;", con)
# Close the connection; all further work uses the in-memory data frames
con.close()
Let's look at the hotel table. We can see there are 13 columns with over 283K unique hotels. But looking at the categories, most of these locations are not actually categorized as hotels. Let's see how many are left if we limit the venues to those that are actually hotels.
# Dimensions of the hotel table: (rows, columns)
hotel.shape
(283086, 13)
# These are venue names in the hotel table that do not contain the word 'Hotel' (capital H, per the
# pattern below). Although many of these are actually hotels, many more are not. Scroll through and
# see Starbucks locations, banks, museums, grocery stores, etc.
list(hotel[-hotel["name"].str.contains("Hotel")]["name"])
# Number of venues actually categorized as hotels
# NOTE(review): this pattern is "Hotels, " (trailing space) while the review filter later uses
# "Hotels," — presumably intended to match the same venues; confirm the counts agree.
hotel[hotel["categories"].str.contains("Hotels, ")]["categories"].count()
10097
The following code uses a SQL-style merge/join to add hotel name and category information to the reviews dataframe, joining on the hotelID field. We are then able to use the categories field to filter out non-hotel reviews.
# Subset hotel df before combining with reviews. We drop the source 'rating' column (the hotel's
# average review score), which we will recalculate ourselves after removing flagged reviews.
hotel_subset = hotel.drop('rating', axis = 1)
# Left join on hotelID: keep every review row and attach the hotel's name/category info to it
reviews = pd.merge(reviews, hotel_subset, how = 'left', on = 'hotelID')
# Subset data to only reviews about hotels. The `== True` comparison also drops rows where
# str.contains returned NaN (reviews whose hotel had a missing categories field).
reviews = reviews[reviews["categories"].str.contains("Hotels,") == True]
reviews.shape
(31723, 21)
Now that we have filtered the review dataset down to approximately 32K reviews categorized as hotels, let's get a sense of how many of these reviews have been flagged as fraudulent and/or violating the Yelp terms of service.
# Percentage of reviews in each 'flagged' category (N/NR = not flagged, Y/YR = flagged as fraudulent)
reviews.groupby(["flagged"])["flagged"].count() / reviews["flagged"].count() * 100
flagged N 16.007313 NR 55.536992 Y 2.458784 YR 25.996911 Name: flagged, dtype: float64
We will remove the ~28% of reviews flagged as fraudulent (Y/YR) and verify that only N/NR reviews remain.
# Keep only reviews not flagged as fraudulent (N/NR)
reviews = reviews.loc[reviews["flagged"].isin(["N", "NR"])]
# Verify only N/NR remain by recomputing the flagged-category percentages
reviews.groupby(["flagged"])["flagged"].count() / reviews["flagged"].count() * 100
flagged N 22.373987 NR 77.626013 Name: flagged, dtype: float64
reviews.shape
(22696, 21)
This data set will be the basis for all further evaluation. To recap, we filtered out any reviews that were for venues not categorized explicitly as a hotel and removed any review flagged as fraudulent. The remaining dataset has all the fields from the data source review table and joined to each review is data from the hotel table.
Next, we will clean the text from reviews using a series of regular expressions to remove non-letter characters and convert to lowercase. This will make it easier for us to perform text analysis. We will also subset our reviews data frame into two smaller data frames for good reviews (ratings of 4/5) and bad reviews (ratings 1/2). For the purposes of our analysis, we consider reviews with a rating of 3 to be neutral.
# Clean review text for all reviews. apply(clean_text) produces a plain object
# (string) column either way, so the original's intermediate astype("category")
# cast added no value and has been dropped.
reviews["cleanText"] = reviews["reviewContent"].apply(clean_text)
We experimented with different methods of categorizing the reviews as good or bad in order to perform textual analysis. At first, we simply divided the reviews into two pools--any review that was 4/5 stars was good and any review that was 1/2 stars was bad. While this method was somewhat effective, we found that it failed to account for the fact that good (bad) reviews at very good (bad) hotels have little in common with good (bad) reviews at bad (good) hotels.
So we took a two-tier approach to dividing our reviews into good and bad pools for further analysis. First, we calculated the average rating for all of the hotels in our dataset. This is different from the average rating in the original data source hotels table because those averages included reviews that we removed (those flagged as fraudulent, for example). Then we divided the hotels into two groups based on our calculated average, using 3 as the dividing line.
The second step took the best reviews of the above-average hotels--those with 4/5 ratings--and the worst reviews of the below-average hotels--those with 1/2 ratings. We created two data frames, as you can see below.
# Average rating per hotel, recomputed from the filtered (hotels-only, non-fraudulent) reviews
hotel_avg = reviews.groupby(["hotelID"]).agg(avg_rating = ("rating", "mean")).reset_index()
# Attach each hotel's recomputed average rating to its reviews
reviews2 = pd.merge(reviews, hotel_avg, how = 'left', on = 'hotelID')
# Create dfs for bad reviews (rating < 3) and good reviews (rating > 3);
# hotels averaging exactly 3 fall into neither pool
good_hotels = reviews2[reviews2["avg_rating"] > 3]
bad_hotels = reviews2[reviews2["avg_rating"] < 3]
# Good reviews (4/5 stars) at above-average hotels; bad reviews (1/2 stars) at
# below-average hotels. A rating of 3 is treated as neutral and excluded.
good_hotels_rev = good_hotels[good_hotels["rating"] > 3]
bad_hotels_rev = bad_hotels[bad_hotels["rating"] < 3]
bad_hotels_rev.shape
(2227, 23)
It can be helpful to visualize the corpus in a word cloud where the size of the word is determined by the frequency in which it occurs. Here are word clouds for both the best reviews and the worst reviews.
# Concatenate the cleaned review text of each pool into one long string
good_text = " ".join(review for review in good_hotels_rev["cleanText"])
bad_text = " ".join(review for review in bad_hotels_rev["cleanText"])
# Define a word cloud from the good-review text (collocations off so bigrams aren't double-counted)
wordcloud_good = WordCloud(collocations = False, width = 1800, height = 800).generate(good_text)
# Plot the word cloud
plt.figure(figsize = (15, 10))
plt.imshow(wordcloud_good, interpolation = "bilinear")
plt.axis("off")
plt.show()
# Define a word cloud from the bad-review text
wordcloud_bad = WordCloud(collocations = False, width = 1800, height = 800).generate(bad_text)
# Plot the word cloud
plt.figure(figsize = (15, 10))
plt.imshow(wordcloud_bad, interpolation = "bilinear")
plt.axis("off")
plt.show()
As we can see from the word clouds, there is quite a bit of overlap in the most frequently used words, regardless of whether the review was good or bad. For example, hotel and room seem to be the most frequently occurring terms. Place, stay, nice, and other terms also occur frequently. While this is a great starting point, there is little to help us understand the context or sentiment associated with these terms.
We used the LDA algorithm to perform topic modeling on our good/bad review data frames. Although we experimented with the number of topics to model for each, we found that 4 topics seems appropriate in both cases. We want to see little overlap in the topics, which we have here. Although higher numbers of topics can also achieve plots with little overlap, the interpretability of these additional topics becomes difficult. We offer a brief interpretation of the topics for good/bad reviews below.
# Import and read in stopwords
from nltk.corpus import stopwords
sw = stopwords.words("english")
# Add additional stopwords to the list used when vectorizing: 'hotel' appears in
# nearly every review, so it adds no signal to the topics
sw.extend(["hotel"])
# Instantiate vectorizer
# NOTE(review): this `v` appears unused — model_topics builds its own vectorizer
# internally. Confirm nothing later in the file depends on it before removing.
v = CountVectorizer(stop_words = sw,
min_df = 0.2,
max_df = 0.8)
# Fit 4-topic LDA models on each pool; each call returns an interactive pyLDAvis display
model_topics(good_hotels_rev["cleanText"], 4)
model_topics(bad_hotels_rev["cleanText"], 4)
Using our topic modeling as a good general overview of the most important hotel features discussed in these reviews, we took a deeper dive by looking at the most frequent unigrams, bigrams, and trigrams in each review pool. The tables below show the top 20 of each and their frequencies.
# Top 20 unigrams (the original comment said "Top 10", but n = 20 is requested)
good_grams = get_ngrams(good_hotels_rev["cleanText"], 1, 1, 20)
goodgrams_pr = pd.DataFrame(good_grams)
goodgrams_pr.columns=["Unigram", "U_Freq"]
# Top 20 bigrams
good_grams = get_ngrams(good_hotels_rev["cleanText"], 2, 2, 20)
df = pd.DataFrame(good_grams)
df.columns = ["Bigram", "Bi_Freq"]
# Column-wise join lines up because all frames share the default 0..19 RangeIndex
goodgrams_pr = goodgrams_pr.join(df["Bigram"])
goodgrams_pr = goodgrams_pr.join(df["Bi_Freq"])
# Top 20 trigrams
good_grams = get_ngrams(good_hotels_rev["cleanText"], 3, 3, 20)
df = pd.DataFrame(good_grams)
df.columns = ["Trigram", "Tri_Freq"]
goodgrams_pr = goodgrams_pr.join(df["Trigram"])
goodgrams_pr = goodgrams_pr.join(df["Tri_Freq"])
goodgrams_pr
Unigram | U_Freq | Bigram | Bi_Freq | Trigram | Tri_Freq | |
---|---|---|---|---|---|---|
0 | hotel | 17107 | room service | 964 | flat screen tv | 337 |
1 | room | 16240 | walking distance | 940 | free wi fi | 330 |
2 | great | 9094 | great location | 844 | king size bed | 170 |
3 | nice | 8136 | staff friendly | 750 | pool hot tub | 166 |
4 | stay | 7329 | place stay | 658 | staff friendly helpful | 166 |
5 | rooms | 6656 | flat screen | 615 | great place stay | 148 |
6 | place | 6080 | ive stayed | 599 | hotel great location | 130 |
7 | like | 5600 | free wifi | 559 | staff super friendly | 108 |
8 | good | 5541 | definitely stay | 518 | flat screen tvs | 108 |
9 | staff | 5455 | wi fi | 517 | room th floor | 105 |
10 | just | 5302 | th floor | 505 | king sized bed | 104 |
11 | service | 4973 | great place | 494 | free continental breakfast | 85 |
12 | free | 4745 | really nice | 489 | id definitely stay | 82 |
13 | night | 4721 | free breakfast | 471 | highly recommend hotel | 82 |
14 | really | 4668 | room clean | 468 | got great deal | 78 |
15 | location | 4581 | hotel room | 464 | hotels ive stayed | 76 |
16 | breakfast | 4317 | friendly helpful | 459 | ordered room service | 72 |
17 | clean | 4054 | continental breakfast | 454 | location location location | 71 |
18 | time | 3991 | living room | 430 | floor ceiling windows | 68 |
19 | area | 3918 | feel like | 420 | holiday inn express | 67 |
We can see that some of the features most associated with good rooms are cleanliness and the presence of a flat screen tv, free wifi, and a king size bed. In terms of location, walking distance (presumably to nearby attractions) seemed to be a key feature. Helpful, friendly staff were frequently mentioned as well. For food, customers seemed to appreciate room service and a free breakfast.
# Top 20 unigrams for the bad-review pool (the original comment said "Top 10", but n = 20 is requested)
bad_grams = get_ngrams(bad_hotels_rev["cleanText"], 1, 1, 20)
badgrams_pr = pd.DataFrame(bad_grams)
badgrams_pr.columns=["Unigram", "U_Freq"]
# Top 20 bigrams
bad_grams = get_ngrams(bad_hotels_rev["cleanText"], 2, 2, 20)
df = pd.DataFrame(bad_grams)
df.columns = ["Bigram", "Bi_Freq"]
# Column-wise join lines up because all frames share the default 0..19 RangeIndex
badgrams_pr = badgrams_pr.join(df["Bigram"])
badgrams_pr = badgrams_pr.join(df["Bi_Freq"])
# Top 20 trigrams
bad_grams = get_ngrams(bad_hotels_rev["cleanText"], 3, 3, 20)
df = pd.DataFrame(bad_grams)
df.columns = ["Trigram", "Tri_Freq"]
badgrams_pr = badgrams_pr.join(df["Trigram"])
badgrams_pr = badgrams_pr.join(df["Tri_Freq"])
badgrams_pr
Unigram | U_Freq | Bigram | Bi_Freq | Trigram | Tri_Freq | |
---|---|---|---|---|---|---|
0 | room | 3742 | holiday inn | 135 | holiday inn express | 41 |
1 | hotel | 2848 | ive stayed | 132 | free wi fi | 36 |
2 | place | 1609 | looked like | 111 | non smoking room | 36 |
3 | like | 1442 | room service | 108 | flat screen tv | 18 |
4 | stay | 1338 | wi fi | 101 | hilton garden inn | 18 |
5 | just | 1272 | hotel room | 100 | congress plaza hotel | 17 |
6 | night | 1142 | parking lot | 97 | room smelled like | 17 |
7 | rooms | 1139 | felt like | 84 | pool hot tub | 16 |
8 | good | 862 | customer service | 81 | worst hotel stayed | 13 |
9 | didnt | 846 | dont know | 80 | room th floor | 13 |
10 | desk | 838 | got room | 80 | im pretty sure | 12 |
11 | really | 822 | didnt work | 77 | called desk ask | 11 |
12 | time | 801 | continental breakfast | 75 | non smoking rooms | 11 |
13 | nice | 771 | staff friendly | 75 | seen better days | 11 |
14 | did | 749 | place stay | 75 | hotel ive stayed | 11 |
15 | dont | 725 | non smoking | 73 | desk staff friendly | 11 |
16 | bed | 724 | free breakfast | 72 | free continental breakfast | 11 |
17 | got | 692 | called desk | 70 | room looked like | 10 |
18 | stayed | 692 | th floor | 70 | lets just say | 10 |
19 | staff | 683 | desk staff | 68 | worst hotel ive | 10 |
In the bad reviews, we see terms related to rooms that indicate they were not clean, had unpleasant odors, or were rundown or dated. Free continental breakfast and room service are again mentioned, emphasizing that these ancillary food-related service experiences are important to customers. Even in bad reviews, we see that interactions with friendly staff are important, but contrasting with the good reviews, friendly only gets you so far--helpful staff seem to significantly improve the guest experience.