# import necessary libraries

import numpy as np
import pandas as pd

# load business data
business = pd.read_json('https://drive.usercontent.google.com/download?id=1HGtRB3g1Hx1t1j2vPqCdTEfG-WJtTFVN&confirm=xxx', lines=True)

business = business.dropna(subset=['latitude', 'longitude', 'stars', 'review_count', 'categories']) # this changes nothing though

business.head()

def identify_restaurants(data, keywords):
    keywords = [keyword.lower() for keyword in keywords]
    def check_categories(category):
        return any(keyword in category.lower() for keyword in keywords)
    return data[data['categories'].apply(check_categories)]

keywords = ['Restaurants', 'Coffee & Tea']
business = identify_restaurants(business, keywords)
business

business['city'].value_counts()[:30]

city
Philadelphia        6171
Tampa               3119
Indianapolis        3004
Tucson              2623
Nashville           2612
New Orleans         2369
Edmonton            2321
Saint Louis         1836
Reno                1396
Boise                912
Santa Barbara        812
Clearwater           712
Wilmington           638
St. Louis            573
Metairie             543
Saint Petersburg     523
Franklin             463
St. Petersburg       427
Sparks               363
Brandon              342
Meridian             338
Largo                327
Carmel               314
Cherry Hill          313
West Chester         292
New Port Richey      238
Kenner               236
Goleta               232
Greenwood            221
Fishers              218
Name: count, dtype: int64

universities = pd.read_csv('./yelp_dataset/universities.csv')

universities = universities.set_index('City')

universities

business = business[business['city'].isin(universities.index)]
business['city'].value_counts()

city
Philadelphia     6171
Tampa            3119
Indianapolis     3004
Tucson           2623
Nashville        2612
New Orleans      2369
Edmonton         2321
Reno             1396
Santa Barbara     812
St. Louis         573
Name: count, dtype: int64

# this is a lat long distance calculator from https://community.esri.com/t5/coordinate-reference-systems-blog/distance-on-a-sphere-the-haversine-formula/ba-p/902128#:~:text=All%20of%20these%20can%20be,longitude%20of%20the%20two%20points

def haversine(coord1, coord2):
    import math

    # Coordinates in decimal degrees (e.g. 2.89078, 12.79797)
    lon1, lat1 = coord1
    lon2, lat2 = coord2

    R = 6371000  # radius of Earth in meters
    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)

    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2.0)**2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0)**2

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    meters = R * c  # output distance in meters
    km = meters / 1000.0  # output distance in kilometers

    meters = round(meters, 3)
    km = round(km, 3)


#     print(f"Distance: {meters} m")
#     print(f"Distance: {km} km‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍")
    return km

def calc_distance(df):
    new_df = df.copy()
    lat, long = universities.loc[df['city'], ['Latitude', 'Longitude']]
    lat1, long1 = df[['latitude', 'longitude']]
    # threshold 5 km
    new_df['close_to_university'] = haversine((lat, long), (lat1, long1)) < 5
    return new_df

business = business.apply(calc_distance, axis=1)

review = pd.read_json('https://drive.usercontent.google.com/download?id=1xE5dbDWd1Mp8kFQwoMmtLVj5xPq9tpuG&confirm=xxx', lines=True)

review.head()

review.dropna(subset=['stars', 'text'])

def identify_student_reviews(data, keywords):

    keywords = [keyword.lower() for keyword in keywords]


    def check_keywords(review):

        return any(keyword in review.lower() for keyword in keywords)

    data['student_or_not'] = data['text'].apply(check_keywords)
    return data

keywords = ['student', 'students', 'college', 'colleges', 'university', 'universities', 'uni', "univ", "penn", "upenn", "ua", "uarizona", "usf", "purdue", "vanderbilt", "vandy", "vu", "unr", "u of a", "ualberta", "washu", "wustl", "ucsb", "uc"]
review_student = identify_student_reviews(review, keywords)

review_student.head()

review_businesses = pd.merge(review_student, business, how='inner', on='business_id')

review_businesses = review_businesses[['city', 'name', 'stars_x', 'student_or_not', 'stars_y', 'review_count', 'close_to_university']]
review_businesses = review_businesses.rename(columns={'name': 'restaurant_name', 'stars_x': 'rating', 'stars_y': 'avg_rating'})

review_businesses

# import all the packages we need

import seaborn as sns
import matplotlib.pyplot as plt
import patsy
import statsmodels.api as sm

review_businesses[review_businesses['close_to_university'] == True].shape[0]

30248

review_businesses[review_businesses['close_to_university'] == False].shape[0]

14070

review_businesses[review_businesses['close_to_university'] == True]['avg_rating'].mean()

3.8469981486379266

review_businesses[review_businesses['close_to_university'] == False]['avg_rating'].mean()

3.737775408670931

test_statistic = review_businesses[review_businesses['close_to_university'] == True]['avg_rating'].mean() - review_businesses[review_businesses['close_to_university'] == False]['avg_rating'].mean()
test_statistic

0.10922273996699561

def permutation_tests():
    diff_array = list()
    shuffled_df = review_businesses.copy()
    for i in range (1000):
        shuffled_df['close_to_university'] = np.random.permutation(shuffled_df['close_to_university'])
        close = shuffled_df[shuffled_df['close_to_university'] == True]['avg_rating'].mean()
        not_close = shuffled_df[shuffled_df['close_to_university'] == False]['avg_rating'].mean()
        diff_array.append((close-not_close))
    return np.array(diff_array)

results = permutation_tests()
plt.hist(results)
plt.xlabel('Difference in Means')
plt.ylabel('Count')
results.mean()
np.mean(test_statistic < results)

0.0

rb_counts = pd.DataFrame(review_businesses.groupby('city')['close_to_university'].value_counts())
rb_counts

rb_avgs = pd.DataFrame(review_businesses.groupby(['city', 'close_to_university'])['avg_rating'].mean())
rb_avgs

def calc_distance(df):
    new_df = df.copy()
    lat, long = universities.loc[df['city'], ['Latitude', 'Longitude']]
    lat1, long1 = df[['latitude', 'longitude']]
    new_df['distance'] = haversine((lat, long), (lat1, long1))
    return new_df

business_lg = business.apply(calc_distance, axis=1)

business_lg

review_business_lg = pd.merge(review_student, business_lg, how='inner', on='business_id')
review_business_lg = review_business_lg[['city', 'name', 'stars_x', 'student_or_not', 'stars_y', 'review_count', 'close_to_university', 'distance']]
review_business_lg = review_business_lg.rename(columns={'name': 'restaurant_name', 'stars_x': 'rating', 'stars_y': 'avg_rating'})

sns.histplot(data=business_lg['distance'])

f1 = plt.gcf()

business_lg = business_lg.drop(business_lg['distance'].idxmax())

sns.histplot(data=business_lg['distance'])
f1 = plt.gcf()

sns.scatterplot(data=review_business_lg, y='rating', x='distance')

<AxesSubplot:xlabel='distance', ylabel='rating'>

sns.scatterplot(data=review_business_lg, x='distance', y='avg_rating')

<AxesSubplot:xlabel='distance', ylabel='avg_rating'>

outcome, predictors = patsy.dmatrices('rating ~ distance', review_business_lg)
mod = sm.OLS(outcome, predictors)
res_1 = mod.fit()

print(res_1.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     41.19
Date:                Tue, 19 Mar 2024   Prob (F-statistic):           1.40e-10
Time:                        21:53:49   Log-Likelihood:                -74091.
No. Observations:               44318   AIC:                         1.482e+05
Df Residuals:                   44316   BIC:                         1.482e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.8933      0.008    459.921      0.000       3.877       3.910
distance      -0.0078      0.001     -6.418      0.000      -0.010      -0.005
==============================================================================
Omnibus:                     5044.150   Durbin-Watson:                   1.701
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6916.979
Skew:                          -0.962   Prob(JB):                         0.00
Kurtosis:                       2.795   Cond. No.                         9.68
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

outcome_2, predictors_2 = patsy.dmatrices('avg_rating ~ distance', review_business_lg)
mod_2 = sm.OLS(outcome_2, predictors_2)
res_2 = mod_2.fit()

print(res_2.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:             avg_rating   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     351.9
Date:                Tue, 19 Mar 2024   Prob (F-statistic):           3.34e-78
Time:                        21:53:50   Log-Likelihood:                -36320.
No. Observations:               44318   AIC:                         7.264e+04
Df Residuals:                   44316   BIC:                         7.266e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.8591      0.004   1069.028      0.000       3.852       3.866
distance      -0.0098      0.001    -18.758      0.000      -0.011      -0.009
==============================================================================
Omnibus:                     7562.557   Durbin-Watson:                   0.181
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            14489.829
Skew:                          -1.058   Prob(JB):                         0.00
Kurtosis:                       4.835   Cond. No.                         9.68
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

outcome_3, predictors_3 = patsy.dmatrices('stars ~ distance', business_lg)
mod_3 = sm.OLS(outcome_3, predictors_3)
res_3 = mod_3.fit()

print(res_3.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  stars   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     193.8
Date:                Tue, 19 Mar 2024   Prob (F-statistic):           6.83e-44
Time:                        21:53:50   Log-Likelihood:                -30688.
No. Observations:               24999   AIC:                         6.138e+04
Df Residuals:                   24997   BIC:                         6.140e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.6597      0.008    450.330      0.000       3.644       3.676
distance      -0.0135      0.001    -13.922      0.000      -0.015      -0.012
==============================================================================
Omnibus:                     1452.109   Durbin-Watson:                   2.011
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1721.998
Skew:                          -0.643   Prob(JB):                         0.00
Kurtosis:                       2.995   Cond. No.                         13.1
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# do linear regression per city, since we only picked one university/college per city
for city in review_business_lg['city'].unique():
    out, pred = patsy.dmatrices('avg_rating ~ distance', review_business_lg[review_business_lg['city']==city])
    mod = sm.OLS(out, pred)
    res = mod.fit()
    print(city, 'distance coef, pvalue:', res.params[1], res.pvalues[1])

Tucson distance coef, pvalue: -0.02348014710517908 1.973349041034627e-29
Philadelphia distance coef, pvalue: 0.002771896479638284 0.060048907543371485
New Orleans distance coef, pvalue: -0.019801151691254182 4.4198834660608146e-13
Santa Barbara distance coef, pvalue: -0.01908444339580161 6.537725936687052e-10
Indianapolis distance coef, pvalue: -0.028078788478805405 2.5607005144326463e-33
Tampa distance coef, pvalue: 0.000914553749047525 0.5695756118487593
Nashville distance coef, pvalue: -0.03489970549495028 8.052083656934554e-101
Reno distance coef, pvalue: -0.06509373125880094 5.080044002626258e-59
Edmonton distance coef, pvalue: -0.044720669841808114 2.2169385782843615e-14
St. Louis distance coef, pvalue: 0.05239540816671164 0.0006505199988430329

for city in business_lg['city'].unique():
    out, pred = patsy.dmatrices('stars ~ distance', business_lg[business_lg['city']==city])
    mod = sm.OLS(out, pred)
    res = mod.fit()
    print(city, 'distance coef, pvalue:', res.params[1], res.pvalues[1])

Philadelphia distance coef, pvalue: -0.021001644673470512 6.938766368582747e-23
Nashville distance coef, pvalue: -0.02282392993556582 6.193023588559855e-12
Indianapolis distance coef, pvalue: -0.027594791297863783 7.873703222892081e-21
Edmonton distance coef, pvalue: -0.0226297324370053 1.2314762213914875e-08
Reno distance coef, pvalue: -0.03741402462439515 4.5181943588653566e-08
Tucson distance coef, pvalue: -0.017036831481580986 1.9837803869667264e-08
Tampa distance coef, pvalue: -0.005786451965124915 0.03325085579968091
Santa Barbara distance coef, pvalue: 0.027875758165070933 3.2117868091776005e-05
New Orleans distance coef, pvalue: -0.02132569106730503 3.8296498449916786e-05
St. Louis distance coef, pvalue: 0.013749277877485347 0.1674120455876262

close_reviews=review_businesses[review_businesses['close_to_university'] == True]
is_student=close_reviews[close_reviews['student_or_not']==True].shape[0]
print(is_student,'of the reviews of restaurants that are close to campus are from students')
not_student=close_reviews[close_reviews['student_or_not']==False].shape[0]
print(not_student, 'of the reviews of restaurants that are close to campus are not from students')

14069 of the reviews of restaurants that are close to campus are from students
16179 of the reviews of restaurants that are close to campus are not from students

avg_rating_student=close_reviews[close_reviews['student_or_not']==True]['avg_rating'].mean()
print(avg_rating_student,'is the average Yelp rating among the reviews from students')
avg_rating_nonstudent=close_reviews[close_reviews['student_or_not']==False]['avg_rating'].mean()
print(avg_rating_nonstudent,' is the average Yelp rating among the reviews from non students')

3.829945269741986 is the average Yelp rating among the reviews from students
3.861827059768836  is the average Yelp rating among the reviews from non students

test_statistic = review_businesses[review_businesses['student_or_not'] == False]['avg_rating'].mean() - review_businesses[review_businesses['student_or_not'] == True]['avg_rating'].mean()
test_statistic

0.02512900573164778

def permutation_tests():
    diff_array = list()
    shuffled_df = review_businesses.copy()
    for i in range (1000):
        shuffled_df['student_or_not'] = np.random.permutation(shuffled_df['student_or_not'])
        student = shuffled_df[shuffled_df['student_or_not'] == True]['avg_rating'].mean()
        nonstudent = shuffled_df[shuffled_df['student_or_not'] == False]['avg_rating'].mean()
        diff_array.append((nonstudent-student))
    return np.array(diff_array)

results = permutation_tests()
plt.hist(results)
plt.xlabel('Difference in Means')
plt.ylabel('Count')
results.mean()
np.mean(test_statistic <= results)

0.0

city_to_university = universities['University Name'].to_dict()

review_businesses['university_name'] = review_businesses['city'].map(city_to_university)

review_businesses

university_city_groups = review_businesses.groupby(['university_name', 'city'])

university_city_averages = []

for (university, city), group in university_city_groups:
    avg_rating_students = group[group['student_or_not'] == True]['rating'].mean()
    avg_rating_nonstudents = group[group['student_or_not'] == False]['rating'].mean()
    
    university_city_averages.append({
        'University Name': university,
        'City': city,
        'Average Rating from Students': avg_rating_students,
        'Average Rating from Non-Students': avg_rating_nonstudents
    })

summary_table = pd.DataFrame(university_city_averages)

summary_table

	business_id	name	address	city	state	postal_code	latitude	longitude	stars	review_count	is_open	attributes	categories	hours
0	Pns2l4eNsfO8kk83dixA6A	Abby Rappoport, LAC, CMQ	1616 Chapala St, Ste 2	Santa Barbara	CA	93101	34.426679	-119.711197	5.0	7	0	{'ByAppointmentOnly': 'True'}	Doctors, Traditional Chinese Medicine, Naturop...	None
1	mpf3x-BjTdTEA3yCZrAYPw	The UPS Store	87 Grasso Plaza Shopping Center	Affton	MO	63123	38.551126	-90.335695	3.0	15	1	{'BusinessAcceptsCreditCards': 'True'}	Shipping Centers, Local Services, Notaries, Ma...	{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...
2	tUFrWirKiKi_TAnsVWINQQ	Target	5255 E Broadway Blvd	Tucson	AZ	85711	32.223236	-110.880452	3.5	22	0	{'BikeParking': 'True', 'BusinessAcceptsCredit...	Department Stores, Shopping, Fashion, Home & G...	{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...
3	MTSW4McQd7CbVtyjqoe9mw	St Honore Pastries	935 Race St	Philadelphia	PA	19107	39.955505	-75.155564	4.0	80	1	{'RestaurantsDelivery': 'False', 'OutdoorSeati...	Restaurants, Food, Bubble Tea, Coffee & Tea, B...	{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...
4	mWMc6_wTdE0EUBKIGXDVfA	Perkiomen Valley Brewery	101 Walnut St	Green Lane	PA	18054	40.338183	-75.471659	4.5	13	1	{'BusinessAcceptsCreditCards': 'True', 'Wheelc...	Brewpubs, Breweries, Food	{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...

	business_id	name	address	city	state	postal_code	latitude	longitude	stars	review_count	is_open	attributes	categories	hours
3	MTSW4McQd7CbVtyjqoe9mw	St Honore Pastries	935 Race St	Philadelphia	PA	19107	39.955505	-75.155564	4.0	80	1	{'RestaurantsDelivery': 'False', 'OutdoorSeati...	Restaurants, Food, Bubble Tea, Coffee & Tea, B...	{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...
5	CF33F8-E6oudUQ46HnavjQ	Sonic Drive-In	615 S Main St	Ashland City	TN	37015	36.269593	-87.058943	2.0	6	1	{'BusinessParking': 'None', 'BusinessAcceptsCr...	Burgers, Fast Food, Sandwiches, Food, Ice Crea...	{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '...
8	k0hlBqXX-Bt0vf1op7Jr1w	Tsevi's Pub And Grill	8025 Mackenzie Rd	Affton	MO	63123	38.565165	-90.321087	3.0	19	0	{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...	Pubs, Restaurants, Italian, Bars, American (Tr...	None
9	bBDDEgkFA1Otx9Lfe7BZUQ	Sonic Drive-In	2312 Dickerson Pike	Nashville	TN	37207	36.208102	-86.768170	1.5	10	1	{'RestaurantsAttire': ''casual'', 'Restaurants...	Ice Cream & Frozen Yogurt, Fast Food, Burgers,...	{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '...
11	eEOYSgkmpB90uNA7lDOMRA	Vietnamese Food Truck		Tampa Bay	FL	33602	27.955269	-82.456320	4.0	10	1	{'Alcohol': ''none'', 'OutdoorSeating': 'None'...	Vietnamese, Food, Restaurants, Food Trucks	{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
150327	cM6V90ExQD6KMSU3rRB5ZA	Dutch Bros Coffee	1181 N Milwaukee St	Boise	ID	83704	43.615401	-116.284689	4.0	33	1	{'WiFi': ''free'', 'RestaurantsGoodForGroups':...	Cafes, Juice Bars & Smoothies, Coffee & Tea, R...	{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '...
150328	1jx1sfgjgVg0nM6n3p0xWA	Savaya Coffee Market	11177 N Oracle Rd	Oro Valley	AZ	85737	32.409552	-110.943073	4.5	41	1	{'BusinessParking': '{'garage': False, 'street...	Specialty Food, Food, Coffee & Tea, Coffee Roa...	{'Monday': '0:0-0:0', 'Tuesday': '6:0-14:0', '...
150336	WnT9NIzQgLlILjPT0kEcsQ	Adelita Taqueria & Restaurant	1108 S 9th St	Philadelphia	PA	19147	39.935982	-75.158665	4.5	35	1	{'WheelchairAccessible': 'False', 'Restaurants...	Restaurants, Mexican	{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...
150339	2O2K6SXPWv56amqxCECd4w	The Plum Pit	4405 Pennell Rd	Aston	DE	19014	39.856185	-75.427725	4.5	14	1	{'RestaurantsDelivery': 'False', 'BusinessAcce...	Restaurants, Comfort Food, Food, Food Trucks, ...	{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...
150340	hn9Toz3s-Ei3uZPt7esExA	West Side Kebab House	2470 Guardian Road NW	Edmonton	AB	T5T 1K8	53.509649	-113.675999	4.5	18	0	{'Ambience': '{'touristy': False, 'hipster': F...	Middle Eastern, Restaurants	{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...

	Country	State	University Name	Latitude	Longitude
City
Philadelphia	USA	Pennsylvania	University of Pennsylvania	39.952583	-75.191975
Tucson	USA	Arizona	University of Arizona Tuscon	32.221664	-110.948922
Tampa	USA	Florida	University of South Florida	28.051836	-82.400005
Indianapolis	USA	Indiana	Purdue University	39.776709	-86.170811
Nashville	USA	Tennessee	Vanderbilt University	36.145532	-86.804060
New Orleans	USA	Louisiana	Tulane University	29.958586	-90.064997
Reno	USA	Nevada	University of Nevada	39.534642	-119.812831
Edmonton	Canada	Alberta	University of Alberta	53.523219	-113.523219
St. Louis	USA	Missouri	Washington University in St. Louis	38.628900	-90.307200
Santa Barbara	USA	California	UC Santa Barbara	34.413953	-119.848956

	review_id	user_id	business_id	stars	useful	cool	text	date
0	KU_O5udG6zpxOg-VcAEodg	mh_-eMZ6K5RLWhZyISBhwA	XQfwVwDr-v0ZS3_CbbE5Xw	3	0	0	If you decide to eat here, just be aware it is...	2018-07-07 22:09:11
1	BiTunyQ73aT9WBnpR9DZGw	OyoGAe7OKpv6SyGZT5g77Q	7ATYjTIgM3jUlt4UM3IypQ	5	1	1	I've taken a lot of spin classes over the year...	2012-01-03 15:28:18
2	saUsX_uimxRlCVr67Z4Jig	8g_iMtfSiwikVnbP2etR0A	YjUWPpI6HXG530lwP-fb2A	3	0	0	Family diner. Had the buffet. Eclectic assortm...	2014-02-05 20:30:30
3	AqPFMleE6RsU23_auESxiA	_7bHUi9Uuf5__HHc_Q8guQ	kxX2SOes4o-D3ZQBkiMRfA	5	1	1	Wow! Yummy, different, delicious. Our favo...	2015-01-04 00:01:03
4	Sx8TMOWLNuJBWer-0pcmoA	bcjbaE6dDog4jkNY91ncLQ	e4Vwtrqf-wpJfwesgvdgxQ	4	1	1	Cute interior and owner (?) gave us tour of up...	2017-01-14 20:54:15

	review_id	user_id	business_id	stars	useful	funny	cool	text	date
0	KU_O5udG6zpxOg-VcAEodg	mh_-eMZ6K5RLWhZyISBhwA	XQfwVwDr-v0ZS3_CbbE5Xw	3	0	0	0	If you decide to eat here, just be aware it is...	2018-07-07 22:09:11
1	BiTunyQ73aT9WBnpR9DZGw	OyoGAe7OKpv6SyGZT5g77Q	7ATYjTIgM3jUlt4UM3IypQ	5	1	0	1	I've taken a lot of spin classes over the year...	2012-01-03 15:28:18
2	saUsX_uimxRlCVr67Z4Jig	8g_iMtfSiwikVnbP2etR0A	YjUWPpI6HXG530lwP-fb2A	3	0	0	0	Family diner. Had the buffet. Eclectic assortm...	2014-02-05 20:30:30
3	AqPFMleE6RsU23_auESxiA	_7bHUi9Uuf5__HHc_Q8guQ	kxX2SOes4o-D3ZQBkiMRfA	5	1	0	1	Wow! Yummy, different, delicious. Our favo...	2015-01-04 00:01:03
4	Sx8TMOWLNuJBWer-0pcmoA	bcjbaE6dDog4jkNY91ncLQ	e4Vwtrqf-wpJfwesgvdgxQ	4	1	0	1	Cute interior and owner (?) gave us tour of up...	2017-01-14 20:54:15
...	...	...	...	...	...	...	...	...	...
99995	pAEbIxvr6ebx2bHc1XvguA	SMH5CeiLvKx61lKwtLZ_PA	lV0k3BnslFRkuWD_kbKd0Q	4	0	0	0	Came here for lunch with a group. They were bu...	2018-05-30 22:28:56
99996	xH1AoE-4nf2ECGQJRjO4_g	2clTdtp-BjphxLjN83CpUA	G0xz3kyRhRi6oZl7KfR0pA	1	1	0	0	The equipment is so old and so felty! I just u...	2015-04-05 23:31:52
99997	GatIbXTz-WDru5emONUSIg	MRrN6DH3QGCFcDv5RENYVg	C4lZdhasjZVQyDlOiXY1sA	4	0	0	0	This is one of my favorite Mexican restaurants...	2016-06-04 00:59:15
99998	6NfkodAdhvI89xONXuBC3A	rnNQzeKJbvqVCsYsL10mkQ	dChRGpit9fM_kZK5pafNyA	2	0	0	0	Came here for brunch - had an omlette ($19 + t...	2018-06-11 12:45:08
99999	sJ1BMq7lkKgOWEFx3n6ZRw	_BcWyKQL16ndpBdggh2kNA	hMcgO98QaOFmQVTfCUeGzw	5	0	0	0	Came in for my 5-6 month prophy and saw Kara -...	2013-06-06 10:10:33

The Analysis of Restaurant Ratings and Their Locations¶

Abstract¶

Research Question¶

Background and Prior Work¶

Hypothesis¶

Data¶

Yelp Academic Dataset¶

Results¶

Exploratory Data Analysis¶

Average Ratings of Close vs Far Businesses¶

Linear Regression on Distance vs Ratings¶

Student vs Non-student Reviews¶

Ethics & Privacy¶

Discussion and Conclusion¶

Team Contributions¶

	city	restaurant_name	rating	student_or_not	avg_rating	review_count	close_to_university
0	Tucson	Kettle Restaurant	3	True	3.5	47	True
1	Tucson	Kettle Restaurant	2	True	3.5	47	True
2	Tucson	Kettle Restaurant	5	False	3.5	47	True
3	Tucson	Kettle Restaurant	5	False	3.5	47	True
4	Tucson	Kettle Restaurant	3	False	3.5	47	True
...	...	...	...	...	...	...	...
44313	Edmonton	Dairy Queen Grill & Chill	1	True	2.0	6	True
44314	Tampa	Grand China	5	False	3.5	19	False
44315	Philadelphia	Dough Boy Pizza	5	False	4.5	11	False
44316	Tucson	Burger King	3	False	1.5	21	True
44317	Edmonton	Versato's Pizza	5	False	4.5	24	False

		count
city	close_to_university
Edmonton	True	786
Edmonton	False	309
Indianapolis	True	2382
Indianapolis	False	1290
Nashville	True	3874
Nashville	False	1549
New Orleans	True	7911
New Orleans	False	508
Philadelphia	True	9820
Philadelphia	False	2797
Reno	True	2271
Reno	False	664
Santa Barbara	False	2125
Santa Barbara	True	2
St. Louis	False	306
St. Louis	True	79
Tampa	False	3385
Tampa	True	768
Tucson	True	2355
Tucson	False	1137

	University Name	City	Average Rating from Students	Average Rating from Non-Students
0	Purdue University	Indianapolis	3.831654	3.903778
1	Tulane University	New Orleans	3.855110	4.061557
2	UC Santa Barbara	Santa Barbara	3.801944	4.063650
3	University of Alberta	Edmonton	3.608130	3.672917
4	University of Arizona Tuscon	Tucson	3.844771	3.876812
5	University of Nevada	Reno	3.737705	3.984840
6	University of Pennsylvania	Philadelphia	3.751565	3.905575
7	University of South Florida	Tampa	3.674469	3.886010
8	Vanderbilt University	Nashville	3.602155	3.892362
9	Washington University in St. Louis	St. Louis	3.705263	3.635897