diff --git a/dataProcess.py b/dataProcess.py index 4a316db..ca51093 100644 --- a/dataProcess.py +++ b/dataProcess.py @@ -12,7 +12,7 @@ videoList = [] isOfficeList = [] for i in result: - if 'video' not in result[i]: + if 'video' not in result[i] or result[i]['video'] == 0: continue fanList.append(result[i]['fan']) viewList.append(result[i]['view']) diff --git a/descriptive-analysis.py b/descriptive-analysis.py index cad681b..8e0c894 100644 --- a/descriptive-analysis.py +++ b/descriptive-analysis.py @@ -1,15 +1,69 @@ import pandas as pd import seaborn as sns -from matplotlib import pyplot +from matplotlib import pyplot as plt data = pd.read_csv("result.csv") -individual = data.loc[data['isOffice'] == 0] +data['viewRate'] = (data['view'] / data['video']) / data['fan'] + +ind = data.loc[data['isOffice'] == 0] office = data.loc[data['isOffice'] == 1] -print(individual['fan']) -print(office['fan']) +# Question 1 +sns.histplot(data=ind, x='fan', bins=100) +plt.savefig("q1-ind-fan-hist.png"); plt.show() -print(list(individual['fan'])) -pyplot.hist(list(individual['fan'])) -pyplot.show() +sns.histplot(data=office, x='fan', bins=100) +plt.savefig("q1-office-fan-hist.png"); plt.show() + +sns.boxplot(data=[ind['fan'], office['fan']], orient='h') +plt.savefig("q1-fan-box.png"); plt.show() + + +# Question 2 +ind = ind.sort_values( + by=['viewRate'] +).iloc[ + len(ind.index)*5//100:len(ind.index)*95//100 +] +office = office.sort_values( + by=['viewRate'] +).iloc[ + len(office.index)*5//100:len(office.index)*95//100 +] + +sns.histplot(data=ind, x='viewRate', bins=100) +plt.savefig("q2-ind-viewRate-hist.png"); plt.show() + +sns.histplot(data=office, x='viewRate', bins=100) +plt.savefig("q2-office-viewRate-hist.png"); plt.show() + +sns.boxplot(data=[ind['viewRate'], office['viewRate']], orient='h') +plt.savefig("q2-viewRate-box.png"); plt.show() + + +# Question 3 +firstHalf = data.iloc[lambda x: x.index < len(data.index)//2] +firstHalf = firstHalf.sort_values( + by=['viewRate'] +).iloc[ + len(firstHalf.index)*5//100:len(firstHalf.index)*95//100 +] +secondHalf = data.iloc[lambda x: x.index >= len(data.index)//2] +secondHalf = secondHalf.sort_values( + by=['viewRate'] +).iloc[ + len(secondHalf.index)*5//100:len(secondHalf.index)*95//100 +] + +sns.histplot(data=firstHalf, x='viewRate', bins=100) +plt.savefig("q3-firstHalf-viewRate-hist.png"); plt.show() + +sns.histplot(data=secondHalf, x='viewRate', bins=100) +plt.savefig("q3-secondHalf-viewRate-hist.png"); plt.show() + +sns.boxplot( + data=[firstHalf['viewRate'], secondHalf['viewRate']], + orient='h' +) +plt.savefig("q3-viewRate-box.png"); plt.show() diff --git a/fail.log b/fail.log index b745b25..89a2786 100644 --- a/fail.log +++ b/fail.log @@ -168,3 +168,5 @@ /user/76A02F61A1A3D5DC_ade6ca /user/F3E4122B2898728F_ffdb84 /user/85DB6A0B1AB05669_fad5be +/user/C85DEA9C8F04CDE7_9f25dd +/user/E17242FA0A1578B4_922114 diff --git a/getRanking.py b/getRanking.py index b0faa30..f18dc6f 100644 --- a/getRanking.py +++ b/getRanking.py @@ -1,6 +1,5 @@ import requests, random, re, time, json from bs4 import BeautifulSoup as bs -from selenium import webdriver import pandas as pd user_agents_list = open("user-agent.txt", 'r').read().split('\n') diff --git a/getVideos.py b/getVideos.py index 2b3c4e0..10ed3d7 100644 --- a/getVideos.py +++ b/getVideos.py @@ -1,6 +1,5 @@ import requests, random, re, time, json from bs4 import BeautifulSoup as bs -from selenium import webdriver import pandas as pd user_agents_list = open("user-agent.txt", 'r').read().split('\n')[:-1] diff --git a/q1-fan-box.png b/q1-fan-box.png new file mode 100644 index 0000000..da68ce1 Binary files /dev/null and b/q1-fan-box.png differ diff --git a/q1-ind-fan-hist.png b/q1-ind-fan-hist.png new file mode 100644 index 0000000..e2b5e4b Binary files /dev/null and b/q1-ind-fan-hist.png differ diff --git a/q1-office-fan-hist.png b/q1-office-fan-hist.png new file mode 100644 index 0000000..a34a5e5 Binary files /dev/null and b/q1-office-fan-hist.png differ diff --git a/q2-ind-viewRate-hist.png b/q2-ind-viewRate-hist.png new file mode 100644 index 0000000..5376571 Binary files /dev/null and b/q2-ind-viewRate-hist.png differ diff --git a/q2-office-viewRate-hist.png b/q2-office-viewRate-hist.png new file mode 100644 index 0000000..adbfe61 Binary files /dev/null and b/q2-office-viewRate-hist.png differ diff --git a/q2-viewRate-box.png b/q2-viewRate-box.png new file mode 100644 index 0000000..60fe86a Binary files /dev/null and b/q2-viewRate-box.png differ diff --git a/q3-firstHalf-viewRate-hist.png b/q3-firstHalf-viewRate-hist.png new file mode 100644 index 0000000..3819ebf Binary files /dev/null and b/q3-firstHalf-viewRate-hist.png differ diff --git a/q3-secondHalf-viewRate-hist.png b/q3-secondHalf-viewRate-hist.png new file mode 100644 index 0000000..9c6fefd Binary files /dev/null and b/q3-secondHalf-viewRate-hist.png differ diff --git a/q3-viewRate-box.png b/q3-viewRate-box.png new file mode 100644 index 0000000..8ae87b8 Binary files /dev/null and b/q3-viewRate-box.png differ diff --git a/statistics-test.py b/statistics-test.py new file mode 100644 index 0000000..9550a08 --- /dev/null +++ b/statistics-test.py @@ -0,0 +1,87 @@ +import pandas as pd +import seaborn as sns +from matplotlib import pyplot as plt +import statistics as st +from scipy import stats +from statsmodels.stats.weightstats import ztest +import numpy as np + +data = pd.read_csv("result.csv") + +data['viewRate'] = (data['view'] / data['video']) / data['fan'] + +ind = data.loc[data['isOffice'] == 0] +office = data.loc[data['isOffice'] == 1] + +# Question 1 +test11 = ztest( + ind['fan'], + office['fan'], + alternative='smaller' +) + +test12 = ztest( + ind['fan'], + office['fan'], + alternative='larger' +) + +print(test11) +print(test12) + +# Question 2 +ind = ind.sort_values( + by=['viewRate'] +).iloc[ + len(ind.index)*5//100:len(ind.index)*95//100 +] +office = office.sort_values( + by=['viewRate'] +).iloc[ + len(office.index)*5//100:len(office.index)*95//100 +] + +test21 = ztest( + ind['viewRate'], + office['viewRate'], + alternative='smaller' +) + +test22 = ztest( + ind['viewRate'], + office['viewRate'], + alternative='larger' +) + +print(test21) +print(test22) + + +# Question 3 +firstHalf = data.iloc[lambda x: x.index < len(data.index)//2] +firstHalf = firstHalf.sort_values( + by=['viewRate'] +).iloc[ + len(firstHalf.index)*5//100:len(firstHalf.index)*95//100 +] +secondHalf = data.iloc[lambda x: x.index >= len(data.index)//2] +secondHalf = secondHalf.sort_values( + by=['viewRate'] +).iloc[ + len(secondHalf.index)*5//100:len(secondHalf.index)*95//100 +] + +test31 = ztest( + firstHalf['viewRate'], + secondHalf['viewRate'], + alternative='smaller' +) + +test32 = ztest( + firstHalf['viewRate'], + secondHalf['viewRate'], + alternative='larger' +) + +print(test31) +print(test32)