Done analyzing
@@ -12,7 +12,7 @@ videoList = []
|
|||||||
isOfficeList = []
|
isOfficeList = []
|
||||||
|
|
||||||
for i in result:
|
for i in result:
|
||||||
if 'video' not in result[i]:
|
if 'video' not in result[i] or result[i]['video'] == 0:
|
||||||
continue
|
continue
|
||||||
fanList.append(result[i]['fan'])
|
fanList.append(result[i]['fan'])
|
||||||
viewList.append(result[i]['view'])
|
viewList.append(result[i]['view'])
|
||||||
|
|||||||
@@ -1,15 +1,69 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
from matplotlib import pyplot
|
from matplotlib import pyplot as plt
|
||||||
|
|
||||||
data = pd.read_csv("result.csv")
|
data = pd.read_csv("result.csv")
|
||||||
|
|
||||||
individual = data.loc[data['isOffice'] == 0]
|
data['viewRate'] = (data['view'] / data['video']) / data['fan']
|
||||||
|
|
||||||
|
ind = data.loc[data['isOffice'] == 0]
|
||||||
office = data.loc[data['isOffice'] == 1]
|
office = data.loc[data['isOffice'] == 1]
|
||||||
|
|
||||||
print(individual['fan'])
|
# Question 1
|
||||||
print(office['fan'])
|
sns.histplot(data=ind, x='fan', bins=100)
|
||||||
|
plt.savefig("q1-ind-fan-hist.png"); plt.show()
|
||||||
|
|
||||||
print(list(individual['fan']))
|
sns.histplot(data=office, x='fan', bins=100)
|
||||||
pyplot.hist(list(individual['fan']))
|
plt.savefig("q1-office-fan-hist.png"); plt.show()
|
||||||
pyplot.show()
|
|
||||||
|
sns.boxplot(data=[ind['fan'], office['fan']], orient='h')
|
||||||
|
plt.savefig("q1-fan-box.png"); plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
# Question 2
|
||||||
|
ind = ind.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(ind.index)*5//100:len(ind.index)*95//100
|
||||||
|
]
|
||||||
|
office = office.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(office.index)*5//100:len(office.index)*95//100
|
||||||
|
]
|
||||||
|
|
||||||
|
sns.histplot(data=ind, x='viewRate', bins=100)
|
||||||
|
plt.savefig("q2-ind-viewRate-hist.png"); plt.show()
|
||||||
|
|
||||||
|
sns.histplot(data=office, x='viewRate', bins=100)
|
||||||
|
plt.savefig("q2-office-viewRate-hist.png"); plt.show()
|
||||||
|
|
||||||
|
sns.boxplot(data=[ind['viewRate'], office['viewRate']], orient='h')
|
||||||
|
plt.savefig("q2-viewRate-box.png"); plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
# Question 3
|
||||||
|
firstHalf = data.iloc[lambda x: x.index < len(data.index)//2]
|
||||||
|
firstHalf = firstHalf.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(firstHalf.index)*5//100:len(firstHalf.index)*95//100
|
||||||
|
]
|
||||||
|
secondHalf = data.iloc[lambda x: x.index >= len(data.index)//2]
|
||||||
|
secondHalf = secondHalf.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(secondHalf.index)*5//100:len(secondHalf.index)*95//100
|
||||||
|
]
|
||||||
|
|
||||||
|
sns.histplot(data=firstHalf, x='viewRate', bins=100)
|
||||||
|
plt.savefig("q3-firstHalf-viewRate-hist.png"); plt.show()
|
||||||
|
|
||||||
|
sns.histplot(data=secondHalf, x='viewRate', bins=100)
|
||||||
|
plt.savefig("q3-secondHalf-viewRate-hist.png"); plt.show()
|
||||||
|
|
||||||
|
sns.boxplot(
|
||||||
|
data=[firstHalf['viewRate'], secondHalf['viewRate']],
|
||||||
|
orient='h'
|
||||||
|
)
|
||||||
|
plt.savefig("q3-viewRate-box.png"); plt.show()
|
||||||
|
|||||||
2
fail.log
@@ -168,3 +168,5 @@
|
|||||||
/user/76A02F61A1A3D5DC_ade6ca
|
/user/76A02F61A1A3D5DC_ade6ca
|
||||||
/user/F3E4122B2898728F_ffdb84
|
/user/F3E4122B2898728F_ffdb84
|
||||||
/user/85DB6A0B1AB05669_fad5be
|
/user/85DB6A0B1AB05669_fad5be
|
||||||
|
/user/C85DEA9C8F04CDE7_9f25dd
|
||||||
|
/user/E17242FA0A1578B4_922114
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import requests, random, re, time, json
|
import requests, random, re, time, json
|
||||||
from bs4 import BeautifulSoup as bs
|
from bs4 import BeautifulSoup as bs
|
||||||
from selenium import webdriver
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
user_agents_list = open("user-agent.txt", 'r').read().split('\n')
|
user_agents_list = open("user-agent.txt", 'r').read().split('\n')
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import requests, random, re, time, json
|
import requests, random, re, time, json
|
||||||
from bs4 import BeautifulSoup as bs
|
from bs4 import BeautifulSoup as bs
|
||||||
from selenium import webdriver
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
user_agents_list = open("user-agent.txt", 'r').read().split('\n')[:-1]
|
user_agents_list = open("user-agent.txt", 'r').read().split('\n')[:-1]
|
||||||
|
|||||||
BIN
q1-fan-box.png
Normal file
|
After Width: | Height: | Size: 8.6 KiB |
BIN
q1-ind-fan-hist.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
q1-office-fan-hist.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
q2-ind-viewRate-hist.png
Normal file
|
After Width: | Height: | Size: 13 KiB |
BIN
q2-office-viewRate-hist.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
q2-viewRate-box.png
Normal file
|
After Width: | Height: | Size: 9.1 KiB |
BIN
q3-firstHalf-viewRate-hist.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
q3-secondHalf-viewRate-hist.png
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
q3-viewRate-box.png
Normal file
|
After Width: | Height: | Size: 9.2 KiB |
87
statistics-test.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
import statistics as st
|
||||||
|
from scipy import stats
|
||||||
|
from statsmodels.stats.weightstats import ztest
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
data = pd.read_csv("result.csv")
|
||||||
|
|
||||||
|
data['viewRate'] = (data['view'] / data['video']) / data['fan']
|
||||||
|
|
||||||
|
ind = data.loc[data['isOffice'] == 0]
|
||||||
|
office = data.loc[data['isOffice'] == 1]
|
||||||
|
|
||||||
|
# Question 1
|
||||||
|
test11 = ztest(
|
||||||
|
ind['fan'],
|
||||||
|
office['fan'],
|
||||||
|
alternative='smaller'
|
||||||
|
)
|
||||||
|
|
||||||
|
test12 = ztest(
|
||||||
|
ind['fan'],
|
||||||
|
office['fan'],
|
||||||
|
alternative='larger'
|
||||||
|
)
|
||||||
|
|
||||||
|
print(test11)
|
||||||
|
print(test12)
|
||||||
|
|
||||||
|
# Question 2
|
||||||
|
ind = ind.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(ind.index)*5//100:len(ind.index)*95//100
|
||||||
|
]
|
||||||
|
office = office.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(office.index)*5//100:len(office.index)*95//100
|
||||||
|
]
|
||||||
|
|
||||||
|
test21 = ztest(
|
||||||
|
ind['viewRate'],
|
||||||
|
office['viewRate'],
|
||||||
|
alternative='smaller'
|
||||||
|
)
|
||||||
|
|
||||||
|
test22 = ztest(
|
||||||
|
ind['viewRate'],
|
||||||
|
office['viewRate'],
|
||||||
|
alternative='larger'
|
||||||
|
)
|
||||||
|
|
||||||
|
print(test21)
|
||||||
|
print(test22)
|
||||||
|
|
||||||
|
|
||||||
|
# Question 3
|
||||||
|
firstHalf = data.iloc[lambda x: x.index < len(data.index)//2]
|
||||||
|
firstHalf = firstHalf.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(firstHalf.index)*5//100:len(firstHalf.index)*95//100
|
||||||
|
]
|
||||||
|
secondHalf = data.iloc[lambda x: x.index >= len(data.index)//2]
|
||||||
|
secondHalf = secondHalf.sort_values(
|
||||||
|
by=['viewRate']
|
||||||
|
).iloc[
|
||||||
|
len(secondHalf.index)*5//100:len(secondHalf.index)*95//100
|
||||||
|
]
|
||||||
|
|
||||||
|
test31 = ztest(
|
||||||
|
firstHalf['viewRate'],
|
||||||
|
secondHalf['viewRate'],
|
||||||
|
alternative='smaller'
|
||||||
|
)
|
||||||
|
|
||||||
|
test32 = ztest(
|
||||||
|
firstHalf['viewRate'],
|
||||||
|
secondHalf['viewRate'],
|
||||||
|
alternative='larger'
|
||||||
|
)
|
||||||
|
|
||||||
|
print(test31)
|
||||||
|
print(test32)
|
||||||