一、背景介绍
Adventure Works Cycles是Adventure Works样本数据库所虚构的公司,这是一家大型跨国制造公司。该公司生产和销售自行车到北美,欧洲和亚洲的商业市场。虽然其基地业务位于华盛顿州博塞尔,拥有290名员工,但几个区域销售团队遍布整个市场。
二、分析目的
分别从整体,地域,产品,用户以及热销产品五个方面展开分析,为公司的制造和销售提供指导性建议,以增加公司的收益。
三、分析框架
四、分析过程
准备工作:
# 导入模块
import pandas as pd
import numpy as np
1. 自行车的整体销售表现情况
1.1 自行车整体销量表现
# 导入数据
gather_customer_order=pd.read_csv("dw_customer_order.csv")
gather_customer_order.info()
#增加create_year_month月份字段。按月维度分析时使用
gather_customer_order['create_year_month']=gather_customer_order['create_date'].apply(lambda x:x.strftime('%Y-%m'))
#筛选产品类别为自行车的数据
gather_customer_order = gather_customer_order[gather_customer_order['cplb_zw'] == '自行车']
#获取每月订单数量和销售金额,用groupby创建一个新的对象,需要将order_num、sum_amount求和
overall_sales_performance = gather_customer_order.groupby('create_year_month').agg({'order_num':'sum','sum_amount':'sum'}).sort_index(ascending = False).reset_index()
overall_sales_performance.head()
# 求销量的环比:
# 增加一列将销量按照日期顺序向后移动与当月的销量相除求得环比
order_num_diff = list(overall_sales_performance['order_num'].diff()/overall_sales_performance['order_num']/-1)
order_num_diff.pop(0)
order_num_diff.append(0)
#将环比转化为DataFrame,添加至表中
overall_sales_performance = pd.concat([overall_sales_performance,pd.DataFrame({'order_num_diff':order_num_diff})],axis = 1)
overall_sales_performance.head()
1.2 自行车整体销售额表现
#求每月自行车销售金额环比
sum_amount_diff = list(overall_sales_performance['sum_amount'].diff()/overall_sales_performance['sum_amount']/-1)
sum_amount_diff.pop(0) #删除列表中第一个元素
sum_amount_diff.append(0) #将0新增到列表末尾
sum_amount_diff
#将环比转化为DataFrame,添加至表中
overall_sales_performance = pd.concat([overall_sales_performance,pd.DataFrame({'sum_amount_diff':sum_amount_diff})],axis = 1)
#查看每月自行车订单量、销售金额、环比
overall_sales_performance.head(5)
1.3 导出数据并使用PowerBI作图
overall_sales_performance.to_csv('pt_overall_sale_performance_1.csv')
2. 2019年11月自行车地域销售表现
2.1 2019年10/11月区域自行车销售情况
筛选出10月与11月记录,对区域与月份分组聚合
#筛选10月11月自行车数据
gather_customer_order_10_11 = gather_customer_order[gather_customer_order['create_year_month'].isin(['2019-10','2019-11'])]
#按照区域、月分组,订单量求和,销售金额求和
gather_customer_order_10_11_group= gather_customer_order_10_11.groupby(['chinese_territory','create_year_month']).agg({'order_num':'sum','sum_amount':'sum'}).reset_index()
根据区域对数据进行偏移求各区域销量与销售额环比
#将区域存为列表
region_list=list(gather_customer_order_10_11_group['chinese_territory'].drop_duplicates())
order_x = pd.Series([])
amount_x = pd.Series([])
for i in region_list:
a=gather_customer_order_10_11_group.loc[gather_customer_order_10_11_group['chinese_territory']==i]['order_num'].pct_change().fillna(0)
b=gather_customer_order_10_11_group.loc[gather_customer_order_10_11_group['chinese_territory']==i]['sum_amount'].pct_change().fillna(0)
order_x=order_x.append(a)
amount_x = amount_x.append(b)
# 将销售量环比与销售额环比列表添加到表中
gather_customer_order_10_11_group['order_diff']=order_x
gather_customer_order_10_11_group['amount_diff']=amount_x
gather_customer_order_10_11_group.head()
导出数据并用PowerBI作图
gather_customer_order_10_11_group.to_csv('pt_bicy_november_territory_2.csv')
2.2 2019年11月自行车销售量TOP10城市环比
#筛选11月自行车交易数据
gather_customer_order_11 = gather_customer_order[gather_customer_order['create_year_month'] == '2019-11'][['chinese_city','order_num']]
# 根据城市对数据分组聚合
gather_customer_order_city_11= gather_customer_order_11.groupby('chinese_city').agg({'order_num':'sum'}).reset_index()
# 根据订单销量进行排序,并取Top10
gather_customer_order_city_head = gather_customer_order_city_11.sort_values(by = 'order_num',ascending = False).head(10)
gather_customer_order_city_head
将Top10的城市名称作为筛选条件,筛选出10/11月自行车销量情况
gather_customer_order_10_11_head = gather_customer_order_10_11[gather_customer_order_10_11['chinese_city'].isin(list(gather_customer_order_city_head['chinese_city']))]
#根据城市与月份分组,计算自行车销售数量销售金额
gather_customer_order_city_10_11 = gather_customer_order_10_11_head[['chinese_city','create_year_month','order_num','sum_amount']]
gather_customer_order_city_10_11 = gather_customer_order_city_10_11.groupby(['chinese_city','create_year_month']).agg({'order_num':'sum','sum_amount':'sum'}).reset_index()
计算Top10城市自行车销量与销售额的环比情况
city_top_list = list(gather_customer_order_city_10_11['chinese_city'].unique())
order_top_x = pd.Series([])
amount_top_x = pd.Series([])
# 遍历各城市,计算各城市销量与销售额环比
for i in city_top_list:
a=gather_customer_order_city_10_11[gather_customer_order_city_10_11['chinese_city'] == i]['order_num'].pct_change().fillna(0)
b=gather_customer_order_city_10_11[gather_customer_order_city_10_11['chinese_city'] == i]['sum_amount'].pct_change().fillna(0)
order_top_x = order_top_x.append(a)
amount_top_x = amount_top_x.append(b)
# 将销量及销售额环比列加入表中
gather_customer_order_city_10_11['order_diff']=order_top_x
gather_customer_order_city_10_11['amount_diff']=amount_top_x
gather_customer_order_city_10_11.head()
导出数据并用PowerBI作图
gather_customer_order_city_10_11.to_csv('pt_bicy_november_october_city_3.csv')
3. 2019年11月自行车产品销售表现
3.1 各月各类自行车销量占比情况
# 按月分组,对销量聚合
gather_customer_order_group_month = gather_customer_order.groupby('create_year_month').agg({'order_num':'sum'}).reset_index()
#合并自行车销售信息表+自行车每月累计销售数量表,pd.merge
order_num_proportion = pd.merge(gather_customer_order,gather_customer_order_group_month,on = 'create_year_month')
#计算自行车销量/自行车每月销量占比
# 后续再作图时,通过对自行车类别的聚合,可以各类别的自行车销量占比
order_num_proportion['order_proportion'] = order_num_proportion['order_num_x']/order_num_proportion['order_num_y']
#重命名sum_month_order:自行车每月销售量
order_num_proportion = order_num_proportion.rename(columns = {'order_num_y':'sum_month_order'})
导出数据并用PowerBI作图
order_num_proportion.to_csv('pt_bicycle_product_sales_month_4.csv')
3.2 公路/山地/旅游自行车细分市场(子类)表现
3.2.1 公路自行车细分(子类)销量情况
公路自行车细分(子类)销量情况
# 公路自行车细分市场销量表现
gather_customer_order_road = gather_customer_order[gather_customer_order['cpzl_zw'] == '公路自行车']
#求公路自行车不同型号产品销售数量
gather_customer_order_road_month = gather_customer_order_road.groupby(by = ['create_year_month','product_name']).agg({'order_num':'sum'}).reset_index()
# 增加子类列
gather_customer_order_road_month['cpzl_zw'] = '公路自行车'
获得公路自行车月销量情况,添加到公路自行车细分销量表中
#每个月公路自行车累计销售数量
gather_customer_order_road_month_sum = gather_customer_order_road_month.groupby('create_year_month').agg({'order_num':'sum'}).reset_index()
#合并公路自行车gather_customer_order_road_month与每月累计销售数量
gather_customer_order_road_month = pd.merge(gather_customer_order_road_month,gather_customer_order_road_month_sum,on = 'create_year_month')
gather_customer_order_road_month.head()
3.2.2 山地自行车细分(子类)销量情况
同公路自行车一样,获取山地自行车子类销量及月销量
# 筛选山地自行车
gather_customer_order_Mountain = gather_customer_order[gather_customer_order['cpzl_zw'] == '山地自行车']
#求山地自行车不同型号产品销售数量
gather_customer_order_Mountain_month = gather_customer_order_Mountain.groupby(['create_year_month','product_name']).agg({'order_num':'sum'}).reset_index()
# 添加列
gather_customer_order_Mountain_month['cpzl_zw'] = '山地自行车'
#每个月公路自行车累计销售数量
gather_customer_order_Mountain_month_sum = gather_customer_order_Mountain_month.groupby('create_year_month').agg({'order_num':'sum'}).reset_index()
#合并山地自行车hz_customer_order_Mountain_month与每月累计销售数量
gather_customer_order_Mountain_month = pd.merge(gather_customer_order_Mountain_month,gather_customer_order_Mountain_month_sum,on = 'create_year_month')
gather_customer_order_Mountain_month.head()
3.2.3 旅游自行车细分(子类)销量情况
gather_customer_order_tour = gather_customer_order[gather_customer_order['cpzl_zw'] == '旅游自行车']
#求旅游自行车不同型号产品销售数量
gather_customer_order_tour_month = gather_customer_order_tour.groupby(['create_year_month','product_name']).agg({'order_num':'sum'}).reset_index()
# 添加自行车子类列
gather_customer_order_tour_month['cpzl_zw'] = '旅游自行车'
# #合并旅游自行车gather_customer_order_road_month与每月累计销售数量
gather_customer_order_tour_month_sum = gather_customer_order_tour_month.groupby('create_year_month').agg({'order_num':'sum'}).reset_index()
gather_customer_order_tour_month = pd.merge(gather_customer_order_tour_month,gather_customer_order_tour_month_sum,on = 'create_year_month')
gather_customer_order_tour_month.head()
3.2.4 将以上公路/山地/旅游自行车表连接
连接三表,同时求各子类月销量占比
#将山地自行车、旅游自行车、公路自行车每月销量信息合并
gather_customer_order_month = pd.concat([gather_customer_order_road_month,gather_customer_order_Mountain_month,gather_customer_order_tour_month],axis = 0)
#各类自行车,销售量占每月自行车总销售量比率
gather_customer_order_month['order_num_proportio'] = gather_customer_order_month['order_num_x']/gather_customer_order_month['order_num_y']
# 重命名
gather_customer_order_month = gather_customer_order_month.rename(columns = {'order_num_x':'order_month_product','order_num_y':'sum_order_month'})
gather_customer_order_month.head()
3.2.5 导出数据并用PowerBI作图
gather_customer_order_month.to_csv('pt_bicycle_product_sales_order_month_4.csv')
3.3 11月自行车细分(子类)的销量情况
3.3.1 从上面的表筛选出10/11月自行车子类的销量占比
#筛选10月11月数据,得到子类占比情况
gather_customer_order_month_10_11 = gather_customer_order_month[gather_customer_order_month.create_year_month.isin(['2019-10','2019-11'])]
#排序
gather_customer_order_month_10_11 = gather_customer_order_month_10_11.sort_values(by = ['product_name','create_year_month'])
gather_customer_order_month_10_11.head(3)
3.3.2 自行车子类环比
# 获取自行车子列的所有product_name,去重生成列表
product_name = list(gather_customer_order_month_10_11.product_name.drop_duplicates())
# 遍历每一种子类,对每种子类求环比
order_top_x = pd.Series([])
for i in product_name:
a = gather_customer_order_month_10_11[gather_customer_order_month_10_11['product_name'] == i]['order_month_product'].pct_change().fillna(0)
order_top_x = order_top_x.append(a)
# 将环比列加入表中
gather_customer_order_month_10_11['order_num_diff'] = order_top_x
gather_customer_order_month_10_11.head()
筛选出11月的信息
#筛选出11月自行车数据
gather_customer_order_month_11 = gather_customer_order_month_10_11[gather_customer_order_month_10_11['create_year_month'] == '2019-11']
gather_customer_order_month_11.head(3)
3.3.3 自行车各子类的累计销量
选取1-11月的自行车销量数据
gather_customer_order_month_1_11 = gather_customer_order_month[gather_customer_order_month['create_year_month'].isin(['2019-01','2019-02','2019-03','2019-04','2019-05','2019-06','2019-07','2019-08','2019-09','2019-10','2019-11'])]
gather_customer_order_month_1_11.head()
求1-11月各子类自行车的累计销量
#计算2019年1月至11月自行车累计销量
gather_customer_order_month_1_11_sum = gather_customer_order_month_1_11.groupby(by = 'product_name').order_month_product.sum().reset_index()
#重命名
gather_customer_order_month_1_11_sum = gather_customer_order_month_1_11_sum.rename(columns = {'order_month_product':'sum_order_1_11'})
gather_customer_order_month_1_11_sum.head()
将11月自行车销量表与1-11月累计销量表关联
gather_customer_order_month_11 = pd.merge(gather_customer_order_month_11,gather_customer_order_month_1_11_sum,on = 'product_name')
gather_customer_order_month_11.head()
导出数据并用PowerBI作图
gather_customer_order_month_11.to_csv('pt_bicycle_product_sales_order_month_11.csv')
4. 2019年11月热品销售分析
4.1 11月产品销量TOP10产品,销售数量及环比
获取销量Top10产品子类
#筛选11月数据
gather_customer_order_11 = gather_customer_order.loc[gather_customer_order['create_year_month'] == '2019-11']
customer_order_11_top10 = gather_customer_order_11.groupby(by = 'product_name').order_num.count().reset_index().\
sort_values(by = 'order_num',ascending = False).head(10)
customer_order_11_top10.head()
获取Top10产品子类的11月环比,基于之前已求的环比表
# 选取列
customer_order_month_10_11 = gather_customer_order_month_10_11[['create_year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
# 获取到top10的销售情况
customer_order_month_10_11 = customer_order_month_10_11[customer_order_month_10_11['product_name']. isin(list(customer_order_11_top10['product_name']))]
# 添加列
customer_order_month_10_11['category'] = '本月TOP10销量'
customer_order_month_10_11.head()
4.2 11月增速TOP10产品,销售数量及环比
# 基于前面已经求得的销量环比表,筛选出11月,同时对环比进行倒序选出Top10
customer_order_month_11 = gather_customer_order_month_10_11.loc[gather_customer_order_month_10_11['create_year_month'] == '2019-11'].\
sort_values(by = 'order_num_diff',ascending = False).head(10)
# 根据增速Top10的产品子类名称,选出10/11月的销量环比表
customer_order_month_11_top10_seep = gather_customer_order_month_10_11.loc[gather_customer_order_month_10_11['product_name'].isin(list(customer_order_month_11['product_name']))]
# 筛选列
customer_order_month_11_top10_seep = customer_order_month_11_top10_seep[['create_year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
# 添加列
customer_order_month_11_top10_seep['category'] = '本月TOP10增速'
customer_order_month_11_top10_seep.head()
合并Top10销量表与Top10增速表
hot_products_11 = pd.concat([customer_order_month_10_11,customer_order_month_11_top10_seep],axis = 0)
hot_products_11.tail()
导出数据并用PowerBI作图
hot_products_11.to_csv('pt_hot_products_november.csv')
5. 用户行为分析
获取数据
# 读取数据库客户信息表
df_CUSTOMER_order=pd.read_csv("ods_customer.csv")
# 读取数据库销售订单表
df_sales_orders_11=pd.read_csv("ods_sales_orders.csv")
# 改变数据格式
df_sales_orders_11['customer_key'] = df_sales_orders_11['customer_key'].astype("int")
# 将两表关联
sales_customer_order_11=pd.merge(df_sales_orders_11,df_CUSTOMER,on='customer_key',how='left')
# 通过出生日期,分解选取年份,删除月日
customer_birth_year = sales_customer_order_11['birth_date'].str.split('-',expand = True).rename(columns = {0:'birth_year'}).drop(labels = [1,2],axis = 1)
sales_customer_order_11 = pd.concat([sales_customer_order_11,customer_birth_year],axis = 1)
sales_customer_order_11.info()
5.1 用户年龄分析
通过出生日期计算年龄,同时对年龄分层
# 为birth_year列填充Null值,计算出年龄
sales_customer_order_11['birth_year'].fillna(0,inplace = True)
sales_customer_order_11['birth_year'] = sales_customer_order_11['birth_year'].astype('int')
sales_customer_order_11['customer_age'] = 2019 - sales_customer_order_11['birth_year']
# 根据年龄分层
sales_customer_order_11['age_level'] = pd.cut(sales_customer_order_11['customer_age'],[30,35,40,45,50,55,60,65],labels = ['30-34','35-39','40-44','45-49','50-54','55-59','60-64'])
sales_customer_order_11.head()
添加年龄比率列,对这列根据不同维度聚合可得到不同维度的占比
#筛选销售订单为自行车的订单信息
df_customer_order_bycle = sales_customer_order_11.loc[sales_customer_order_11['cplb_zw'] == '自行车']
# 计算年龄比率
df_customer_order_bycle['age_level_rate'] = 1/len(df_customer_order_bycle)
将年龄再次分组
#将年龄分为3个层次 # 用到了选取行列,然后给行列传值的方法
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] <= 29),'age_level2'] = '<=29'
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] >= 30)&(df_customer_order_bycle['customer_age'] <= 39),'age_level2'] = '30-39'
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] >= 40),'age_level2'] = '>=40'
# 求每个年龄段的人数
age_level2_count = df_customer_order_bycle.groupby(by = 'age_level2').sales_order_key.count().reset_index()
# 连接到大表中
df_customer_order_bycle = pd.merge(df_customer_order_bycle,age_level2_count,on = 'age_level2').rename(columns = {'sales_order_key_y':'age_level2_count'})
df_customer_order_bycle['age_level2_rate'] = 1/df_customer_order_bycle['age_level2_count']
5.2 用户性别分析
按性别分组聚合,添加至大表
df_customer_order_bycle = pd.merge(df_customer_order_bycle,gender_count,on = 'gender').rename(columns = {'cplb_zw_y':'gender_count'})
df_customer_order_bycle['gender_rate'] = 1/df_customer_order_bycle['gender_count']
5.3 导出数据并用PowerBI作图
df_customer_order_bycle.to_csv('pt_user_behavior_november.csv')