读取数据

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import timeit

import random

import time


sheet_1 = pd.read_csv("1.csv",encoding='gb18030')

sheet_2 = pd.read_csv("2.csv",encoding='gb18030')
excel3 = pd.ExcelFile('3.xlsx')
sheet_3 = pd.read_excel(excel3)

excel4 = pd.ExcelFile('4.xlsx')
sheet_4 = pd.read_excel(excel4)

筛选出未完成的订单

unfinished_1 = sheet_1[sheet_1['is_finished']==0]
unfinished_2 = sheet_2[sheet_2['is_finished']==0]
num_of_turnover = sheet_1.shape[0] + sheet_2.shape[0]
num_of_unfinished = unfinished_1.shape[0] + unfinished_2.shape[0]
labels = 'unfinished', 'turnover'
sizes = [num_of_unfinished , num_of_turnover-num_of_unfinished]
explode = (0, 0.3)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

png

剔除掉7.9%的无效数据

sheet_1 = sheet_1[sheet_1['is_finished']!=0]
sheet_2 = sheet_2[sheet_2['is_finished']!=0]
no_cost_prc_1 = sheet_1[sheet_1['sku_cost_prc'] == 0.0]
no_cost_prc_2 = sheet_2[sheet_2['sku_cost_prc'] == 0.0]
num_of_no_cost_prc = no_cost_prc_1.shape[0] + no_cost_prc_2.shape[0] 

64%的记录成本信息缺失

num_of_turnover = sheet_1.shape[0] + sheet_2.shape[0]
labels = 'no cost price', 'turnover'
sizes = [num_of_no_cost_prc , num_of_turnover-num_of_no_cost_prc]
explode = (0, 0.3)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

png

sheet = pd.concat([sheet_1,sheet_2])
no_cost_prc = pd.concat([no_cost_prc_1,no_cost_prc_2])
full_info_turnover = sheet[sheet['sku_cost_prc']>0.0]

使用iterrow()方法


for index, row in no_cost_prc.iterrows():
    print(index)
    sku_id = row['sku_id']
    sku_sale_prc = row['sku_sale_prc']
    temp = full_info_turnover[
        (full_info_turnover['sku_id'] == sku_id) & (full_info_turnover['sku_sale_prc'] == sku_sale_prc)]
    if temp.empty == False:
        row['sku_cost_prc'] = temp['sku_cost_prc'].values[0]
    else:
        continue

使用Ndarray

sku_sale_prc_arr = full_info_turnover['sku_sale_prc'].values

sku_cost_prc_arr = full_info_turnover['sku_cost_prc'].values

sku_id_arr = full_info_turnover['sku_id'].values

non_sku_cost_arr = no_cost_prc['sku_cost_prc'].values

non_sku_sale_prc_arr = no_cost_prc['sku_sale_prc'].values

non_sku_id_arr = no_cost_prc['sku_id'].values

for i in range(non_sku_cost_arr.shape[0]):
    print(i)
    temp = sku_cost_prc_arr[np.where(sku_id_arr == non_sku_id_arr[i]) and np.where(sku_sale_prc_arr == non_sku_sale_prc_arr[i])]
    if temp.size !=0:
        non_sku_cost_arr[i] = temp[0]
    else:
        continue

使用Merge方法进行匹配

对于能匹配到的信息的商品用信息补全 没有匹配到的取利润率20%到40%间的一个随机数

no_cost_prc = no_cost_prc.drop_duplicates(keep='first')
Merged_info = pd.merge(no_cost_prc,full_info_turnover,how = "left",on = ['sku_id','sku_sale_prc'])
norepeat_df = Merged_info.drop_duplicates(subset=['sku_id','sku_sale_prc','create_dt_x','order_id_x','sku_name_x','is_finished_x','sku_cnt_x','sku_prc_x','sku_cost_prc_x','upc_code_x'], keep='first')
norepeat_df.index = np.arange(norepeat_df.shape[0])
no_cost_prc.index = np.arange(norepeat_df.shape[0])
norepeat_df.iloc[:,8] = norepeat_df.iloc[:,16]
norepeat_df['sku_cost_prc_x'] = norepeat_df['sku_cost_prc_x'].fillna(0)
norepeat_df_filled = norepeat_df[norepeat_df['sku_cost_prc_x'] !=0]
norepeat_df_to_fill = norepeat_df[norepeat_df['sku_cost_prc_x'] ==0]
norepeat_df_to_fill['sku_cost_prc_x'] = norepeat_df_to_fill.apply(lambda x:x['sku_sale_prc']/(random.uniform(0.2, 0.4)+1),axis=1)
norepeat_df_fullfilled = pd.concat([norepeat_df_to_fill,norepeat_df_filled])
norepeat_df_fullfilled = norepeat_df_fullfilled.iloc[:,:10]
ori_tags = ['create_dt','order_id','sku_id','sku_name','is_finished','sku_cnt','sku_prc','sku_sale_prc','sku_cost_prc','upc_code']
norepeat_df_fullfilled.columns=ori_tags
fullfiled_turnovers = pd.concat([norepeat_df_fullfilled,full_info_turnover])
fullfiled_turnovers.index = np.arange(fullfiled_turnovers.shape[0])

展示补全的表

fullfiled_turnovers
create_dt order_id sku_id sku_name is_finished sku_cnt sku_prc sku_sale_prc sku_cost_prc upc_code
0 2016-11-30 628897900000121 2005508919 进口 香蕉 约1kg 1 1 8.8 8.8 6.790909 2400002414566
1 2016-11-30 628897900000121 2004962651 伊利 畅轻风味发酵乳 燕麦+芒果味酸奶 250g 1 1 7.5 7.5 5.365986 6907992103419
2 2016-12-02 629091200000042 2005508964 五花肉 约500g 1 1 13.5 13.5 10.055045 2400002413040
3 2016-12-02 629091200000042 2005508949 青尖椒 约400g 1 1 3.9 3.9 2.933400 2400002414306
4 2016-12-02 629091200000042 2005468473 天马 腐竹 180g 1 1 10.8 10.8 8.702881 6922547652280
... ... ... ... ... ... ... ... ... ... ...
1123427 2019-01-02 900150830000141 2009205131 Richeese/丽芝士 印度尼西亚进口 纳宝帝奶酪威化饼干 350g 1 1 19.9 19.8 19.800000 8.99318e+12
1123428 2019-01-02 900150830000141 2003621597 思念 宁波风味黑芝麻汤圆 1kg 1 1 10.9 9.9 9.900000 6.92167e+12
1123429 2019-01-02 900150830000141 2006240781 巨龙 窝窝米酒+红糖米酒 828g(418g+410g) 1 1 11.8 9.9 9.900000 6.92412e+12
1123430 2019-01-02 900152711000122 2011596565 海天 上等蚝油组合装 520g*2瓶(新旧包装,随机发货) 1 1 15.0 9.9 9.900000 6.90227e+12
1123431 2019-01-02 900152711000122 2006613346 黄瓜山 鲜鸡蛋 10枚?400g 1 1 10.9 9.9 9.900000 6.95612e+12

1123432 rows × 10 columns

Q1 计算该商场从2016年11月30日到2019年1月2日每天的营业额和利润率

Q1_start_time = np.datetime64('2016-11-30')
Q1_end_time = np.datetime64('2019-01-02')
fullfiled_turnovers['create_dt'] = fullfiled_turnovers['create_dt'].astype('datetime64')
Sales_dic = {}
Ori_sales = {}
Profit_rate_dic = {}
flag = 0
def process_sheet(Q1_start_time):
    time_revised_sheet_1 = fullfiled_turnovers[(fullfiled_turnovers['create_dt']==Q1_start_time)]
    Q1_sheet = time_revised_sheet_1[['sku_cnt','sku_prc','sku_sale_prc','sku_cost_prc','upc_code']]
    Q1_sheet['upc_code'] = Q1_sheet.apply(lambda x:x['sku_cnt']*x['sku_sale_prc'],axis=1)
    Q1_sales = Q1_sheet['upc_code'].sum()
    
    Q1_sheet = time_revised_sheet_1[['sku_cnt','sku_prc','sku_sale_prc','sku_cost_prc','upc_code']]
    Q1_sheet['upc_code'] = Q1_sheet.apply(lambda x:x['sku_cnt']*x['sku_cost_prc'],axis=1)
    Q1_costs = Q1_sheet['upc_code'].sum()
    Q1_profit_rate = (Q1_sales - Q1_costs)/Q1_costs
    
    Q1_sheet = time_revised_sheet_1[['sku_cnt','sku_prc','sku_sale_prc','sku_cost_prc','upc_code']]
    Q1_sheet['upc_code'] = Q1_sheet.apply(lambda x:x['sku_cnt']*x['sku_prc'],axis=1)
    Q1_ori_sales = Q1_sheet['upc_code'].sum()


    return Q1_profit_rate,Q1_sales,Q1_costs,Q1_ori_sales

while Q1_start_time<=Q1_end_time:
    
    if (fullfiled_turnovers[(fullfiled_turnovers['create_dt']==Q1_start_time)].empty):
        Sales_dic[flag] = [Q1_start_time,0]
        Profit_rate_dic[flag] = [Q1_start_time,0]
        Q1_start_time = Q1_start_time + 1
        continue
    if not(fullfiled_turnovers[(fullfiled_turnovers['create_dt']==Q1_start_time)].empty):
        Q1_profit_rate,Q1_sales,Q1_costs,Q1_ori_sales = process_sheet(Q1_start_time)
            
    Sales_dic[flag] = [Q1_start_time,Q1_sales]
    Profit_rate_dic[flag] = [Q1_start_time,Q1_profit_rate]
    Ori_sales[flag] = [Q1_start_time,Q1_ori_sales]
    
    Q1_start_time = Q1_start_time + 1
    
    flag += 1

Q2 选取合适的指标衡量商场每天的打折力度,并计算2016年11月30日到2019年1月2日每天的打折力度

用每天的标价和售价的差再除以标价来定义打折力度

Before 2017年3月7日

Profit_rate = pd.DataFrame(Profit_rate_dic).T
Profit_rate.columns = ['date','profit_rate']
Discont_rate_dic = {}
Ori_sales_pd = pd.DataFrame(Ori_sales).T
Ori_sales_pd.columns = ['date','ori_sales']
Sales = pd.DataFrame(Sales_dic).T
Sales.columns = ['date','sale']
Q2_final_sheet = pd.merge(Ori_sales_pd,Sales,on=['date'])
Q2_final_sheet['discount_rate'] = ''
Q2_final_sheet_ = pd.merge(Q2_final_sheet,Profit_rate,on=['date'])
Q2_final_sheet_['discount_rate'] = Q2_final_sheet.apply(lambda x:(x['ori_sales']-x['sale'])/x['ori_sales'],axis=1)
Q2_final_sheet_
date ori_sales sale discount_rate profit_rate
0 2016-11-30 3306.3 2833.7 0.142939 0.0856733
1 2016-12-01 2564 2346.2 0.084945 0.143429
2 2016-12-02 2594.9 2338.3 0.098886 0.123242
3 2016-12-03 3692.6 3270.4 0.114337 0.119463
4 2016-12-04 4319.18 3881.68 0.101292 0.11417
... ... ... ... ... ...
758 2018-12-29 68383.4 60221.9 0.119349 0.108456
759 2018-12-30 172531 150242 0.129189 0.136967
760 2018-12-31 157145 138077 0.121343 0.146125
761 2019-01-01 73650.9 65018 0.117214 0.0982922
762 2019-01-02 35702.4 31072.6 0.129678 0.0993041

763 rows × 5 columns

Q3分析打折力度与商品销售额以及利润率的关系

x = np.arange(0,len(Sales_dic))

fig = plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(x,Q2_final_sheet_['sale'].values,'r')
ax.set_title('Sales per Day')
ax.set_xticks([0,760])
ax.set_xticklabels(['2016-11-30','2019-1-2'],fontsize=    'small')
plt.savefig('4.jpg')
plt.show()

png

上图为销售额画图

fig = plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(x,Q2_final_sheet_['profit_rate'].values)
ax.set_title('Profit rate per Day')
ax.set_xticks([0,760])
ax.set_xticklabels(['2016-11-30','2019-1-2'],fontsize=    'small')
plt.savefig('4.jpg')
plt.show()

png

x = np.arange(0,len(Profit_rate_dic))


fig = plt.figure()
ax=fig.add_subplot(1,1,1)
ax.plot(x,Q2_final_sheet_['discount_rate'],'r')
ax.set_title('Discount value per Day')
ax.set_xticks([0,760])
ax.set_xticklabels(['2016-11-30','2019-1-2'],fontsize='small')
plt.savefig('4.jpg')
plt.show()

png

上图为打折力度

Q2_final_sheet_['profit_rate'] = Q2_final_sheet_['profit_rate'].astype('float')
Q2_final_sheet_['sale'] = Q2_final_sheet_['sale'].astype('float')
Q2_final_sheet_['ori_sales'] = Q2_final_sheet_['ori_sales'].astype('float')
Q2_final_sheet_.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 763 entries, 0 to 762
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           763 non-null    datetime64[ns]
 1   ori_sales      763 non-null    float64       
 2   sale           763 non-null    float64       
 3   discount_rate  763 non-null    float64       
 4   profit_rate    763 non-null    float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 35.8 KB
Q2_final_sheet_.corr()
ori_sales sale discount_rate profit_rate
ori_sales 1.000000 0.999409 0.391121 -0.342592
sale 0.999409 1.000000 0.371427 -0.329258
discount_rate 0.391121 0.371427 1.000000 -0.555441
profit_rate -0.342592 -0.329258 -0.555441 1.000000

打折力度和销售额

x = Q2_final_sheet_['discount_rate'].values.reshape(-1, 1)
y = Q2_final_sheet_['sale'].values.reshape(-1, 1)

plt.xlim(-0,0.21)
plt.ylim(-100,100000)

plt.scatter(x, y,s=5)
plt.show()

png

打折力度和利润率

x = Q2_final_sheet_['discount_rate'].values.reshape(-1, 1)
y = Q2_final_sheet_['profit_rate'].values.reshape(-1, 1)

plt.scatter(x, y,s=5)
plt.show()

png

打折力度和销售额的关系

from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(Q2_final_sheet_['discount_rate'].values.reshape(-1, 1), Q2_final_sheet_['sale'].values.reshape(-1, 1))
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

线性回归系数

linreg.intercept_
array([-11890.35672382])
linreg.coef_
array([[350536.31765171]])
x = Q2_final_sheet_['discount_rate'].values.reshape(-1, 1)
y = Q2_final_sheet_['sale'].values.reshape(-1, 1)
x_ = Q2_final_sheet_['sale'].sum()/Q2_final_sheet_.shape[0]

x_

re_sq = Q2_final_sheet_['sale'] - x_

re_sq = re_sq.apply(lambda x:x*x).sum()

yr = linreg.predict(x)

res = np.mean((linreg.predict(x) - y) ** 2)

R_square = 1- res/re_sq

R_square
0.9988701937555191
res
543409378.8085167
re_sq
480975726115.033
x = Q2_final_sheet_['discount_rate'].values.reshape(-1, 1)
y = Q2_final_sheet_['sale'].values.reshape(-1, 1)

plt.xlim(-0,0.21)
plt.ylim(-100,100000)

Y = yr


plt.scatter(x, y,s=5)

plt.plot(x,Y,'r-')
plt.show()

png

打折力度和利润率的关系

x = Q2_final_sheet_['discount_rate'].values.reshape(-1, 1)
y = Q2_final_sheet_['profit_rate'].values.reshape(-1, 1)
linreg_ = LinearRegression()
linreg_.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

计算出的系数

linreg_.intercept_
array([0.16283051])
linreg_.coef_
array([[-0.37723069]])
yr_ = linreg_.predict(x)

plt.scatter(x, y,s=5)
plt.plot(x,yr_,'r-')

plt.show()

png

```python x_ = Q2final_sheet[‘discount_rate’].sum()/Q2final_sheet.shape[0]

x_

re_sq = Q2final_sheet[‘discount_rate’] - x_

re_sq = re_sq.apply(lambda x:x*x).sum()

yr_ = linreg_.predict(x)

res = np.mean((linreg_.predict(x) - y) ** 2)

R_square = 1- res/re_sq

Q4 考虑大类区分,打折力度和商品销售额以利润率的关系是什么