Done analyzing

This commit is contained in:
Ian Shih
2023-06-15 12:33:33 +08:00
parent c746cdd016
commit 0e4280a1ae
15 changed files with 151 additions and 10 deletions

View File

@@ -12,7 +12,7 @@ videoList = []
isOfficeList = [] isOfficeList = []
for i in result: for i in result:
if 'video' not in result[i]: if 'video' not in result[i] or result[i]['video'] == 0:
continue continue
fanList.append(result[i]['fan']) fanList.append(result[i]['fan'])
viewList.append(result[i]['view']) viewList.append(result[i]['view'])

View File

@@ -1,15 +1,69 @@
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
from matplotlib import pyplot from matplotlib import pyplot as plt
data = pd.read_csv("result.csv") data = pd.read_csv("result.csv")
individual = data.loc[data['isOffice'] == 0] data['viewRate'] = (data['view'] / data['video']) / data['fan']
ind = data.loc[data['isOffice'] == 0]
office = data.loc[data['isOffice'] == 1] office = data.loc[data['isOffice'] == 1]
print(individual['fan']) # Question 1
print(office['fan']) sns.histplot(data=ind, x='fan', bins=100)
plt.savefig("q1-ind-fan-hist.png"); plt.show()
print(list(individual['fan'])) sns.histplot(data=office, x='fan', bins=100)
pyplot.hist(list(individual['fan'])) plt.savefig("q1-office-fan-hist.png"); plt.show()
pyplot.show()
sns.boxplot(data=[ind['fan'], office['fan']], orient='h')
plt.savefig("q1-fan-box.png"); plt.show()
# Question 2
def _trim_tails(frame, column='viewRate', lower=5, upper=95):
    """Return the middle slice of `frame` after sorting it by `column`.

    Rows are sorted ascending by `column`; the rows positioned below
    `lower` percent and at or above `upper` percent of the row count
    (integer floor division) are dropped, which removes the extreme
    values at both ends before plotting.
    """
    count = len(frame.index)
    ordered = frame.sort_values(by=[column])
    return ordered.iloc[count * lower // 100:count * upper // 100]


# Compare the trimmed viewRate distributions of the two account groups.
ind = _trim_tails(ind)
office = _trim_tails(office)
sns.histplot(data=ind, x='viewRate', bins=100)
plt.savefig("q2-ind-viewRate-hist.png")
plt.show()
sns.histplot(data=office, x='viewRate', bins=100)
plt.savefig("q2-office-viewRate-hist.png")
plt.show()
sns.boxplot(data=[ind['viewRate'], office['viewRate']], orient='h')
plt.savefig("q2-viewRate-box.png")
plt.show()
# Question 3
# Split the rows by position into two halves (file order). A positional
# slice gives the same rows as comparing index labels against the midpoint
# when the index is the default 0..n-1, and keeps working if it is not.
half = len(data.index) // 2
firstHalf = _trim_tails(data.iloc[:half])
secondHalf = _trim_tails(data.iloc[half:])
sns.histplot(data=firstHalf, x='viewRate', bins=100)
plt.savefig("q3-firstHalf-viewRate-hist.png")
plt.show()
sns.histplot(data=secondHalf, x='viewRate', bins=100)
plt.savefig("q3-secondHalf-viewRate-hist.png")
plt.show()
sns.boxplot(
    data=[firstHalf['viewRate'], secondHalf['viewRate']],
    orient='h'
)
plt.savefig("q3-viewRate-box.png")
plt.show()

View File

@@ -168,3 +168,5 @@
/user/76A02F61A1A3D5DC_ade6ca /user/76A02F61A1A3D5DC_ade6ca
/user/F3E4122B2898728F_ffdb84 /user/F3E4122B2898728F_ffdb84
/user/85DB6A0B1AB05669_fad5be /user/85DB6A0B1AB05669_fad5be
/user/C85DEA9C8F04CDE7_9f25dd
/user/E17242FA0A1578B4_922114

View File

@@ -1,6 +1,5 @@
import requests, random, re, time, json import requests, random, re, time, json
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from selenium import webdriver
import pandas as pd import pandas as pd
user_agents_list = open("user-agent.txt", 'r').read().split('\n') user_agents_list = open("user-agent.txt", 'r').read().split('\n')

View File

@@ -1,6 +1,5 @@
import requests, random, re, time, json import requests, random, re, time, json
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from selenium import webdriver
import pandas as pd import pandas as pd
user_agents_list = open("user-agent.txt", 'r').read().split('\n')[:-1] user_agents_list = open("user-agent.txt", 'r').read().split('\n')[:-1]

BIN
q1-fan-box.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.6 KiB

BIN
q1-ind-fan-hist.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
q1-office-fan-hist.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
q2-ind-viewRate-hist.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
q2-office-viewRate-hist.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
q2-viewRate-box.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
q3-viewRate-box.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

87
statistics-test.py Normal file
View File

@@ -0,0 +1,87 @@
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import statistics as st
from scipy import stats
from statsmodels.stats.weightstats import ztest
import numpy as np
def _trim_tails(frame, column='viewRate', lower=5, upper=95):
    """Return the middle slice of `frame` after sorting it by `column`.

    Rows are sorted ascending by `column`; the rows positioned below
    `lower` percent and at or above `upper` percent of the row count
    (integer floor division) are dropped, which removes the extreme
    values at both ends before testing.
    """
    count = len(frame.index)
    ordered = frame.sort_values(by=[column])
    return ordered.iloc[count * lower // 100:count * upper // 100]


def _one_sided_ztests(first, second):
    """Run `ztest(first, second)` with alternative 'smaller', then 'larger'.

    Returns a 2-tuple holding the two `ztest` results in that order.
    """
    return tuple(
        ztest(first, second, alternative=alternative)
        for alternative in ('smaller', 'larger')
    )


data = pd.read_csv("result.csv")
# viewRate = (views per video) per fan.
# NOTE(review): a zero in 'video' or 'fan' would produce inf/NaN here --
# confirm result.csv never contains such rows.
data['viewRate'] = (data['view'] / data['video']) / data['fan']
ind = data.loc[data['isOffice'] == 0]
office = data.loc[data['isOffice'] == 1]
# Question 1
# Fan counts of isOffice == 0 rows versus isOffice == 1 rows (untrimmed).
test11, test12 = _one_sided_ztests(ind['fan'], office['fan'])
print(test11)
print(test12)
# Question 2
# viewRate of the same two groups, each trimmed to its middle slice first.
ind = _trim_tails(ind)
office = _trim_tails(office)
test21, test22 = _one_sided_ztests(ind['viewRate'], office['viewRate'])
print(test21)
print(test22)
# Question 3
# Split the rows by position into two halves (file order). A positional
# slice gives the same rows as comparing index labels against the midpoint
# for the default 0..n-1 index that read_csv produces here.
half = len(data.index) // 2
firstHalf = _trim_tails(data.iloc[:half])
secondHalf = _trim_tails(data.iloc[half:])
test31, test32 = _one_sided_ztests(
    firstHalf['viewRate'],
    secondHalf['viewRate'],
)
print(test31)
print(test32)