Adventure Works Cycles公司数据分析
1 背景
Adventure Works Cycles是AdventureWorks样本数据库所虚构的公司,这是一家大型跨国制造公司。该公司生产和销售金属和复合材料自行车到北美,欧洲和亚洲的商业市场。
Adventure Works Cycle这家公司的客户主要有两种:
• 个体:这些客户购买商品是通过网上零售店铺
• 商店:这些是从Adventure Works Cycles销售代表处购买转售产品的零售店或批发店
这家公司主要有下面四个产品线:
• Adventure Works Cycles 生产的自行车
• 自行车部件,例如车轮,踏板或制动组件
• 从供应商处购买的自行车服装,用于转售给Adventure Works Cycles的客户。
• 从供应商处购买的自行车配件,用于转售给Adventure Works Cycles客户。
2 项目目的
• 将数据导入Hive数据库
• 探索数据库并罗列分析指标
• 汇总数据建立数据仓库(销售主题)
• powerbi可视化
• 制作11月自行车业务分析报告
项目成果部分图片:
ppt成果图片:PPT—11月自行车业务分析报告
3 数据处理
3.1 数据来源
项目数据描述:数据来源于adventure Works Cycles公司的的样本数据库,包括了公司4大应用场景的数据:Sales、Finance、Product、Manufacture
3.2 数据理解
3.3 数据导入
现有资料是一个从sqlserver导出的.sql文件,里面有表名、表字段以及每个表的csv数据,利用文件读写+正则表达式解析出 hive 建表语句,然后利用 pandas 指定编码读取再保存为 csv 文件
1、在hive中创建一个基础数据层库adventure_cj,用来存放基础表数据
2、在python中利用正则表达式循环读取文件,获取表名和字段名
得到以下数据
3、解析 table_info 字典,用来创建表
4、导入数据到hive
5、用遍历文件和pandas 读取数据使之转换成指定 utf8 编码格式的代码
得到utf8编码格式的csv文件并以|分隔
6、建立数据仓库汇总层
(1)创建一个新的数据库 adventure_dw_cj,为了方便查阅将adventure_cj中的基础表迁移过来
use adventure_dw_cj;
# 销售地区维度表
create table dimsalesterritory as select * from adventure_cj.dimsalesterritory;
# 网络销售表
create table FactInternetSales as select * from adventure_cj.FactInternetSales;
# 产品维度表
create table DimProduct as select * from adventure_cj.DimProduct;
(2)建立事实表 fact_time ,对比去年、上个月的销售额、销售额数量的不同
use adventure_dw_cj;
create table fact_time as
SELECT
a.*,
b.amount AS amount_last_year,
b.order_number AS order_number_last_year,
c.amount AS amount_last_month,
c.order_number AS order_number_last_month,
round(((a.amount-c.amount)/c.amount)*100,2) AS amount_comp_last_month,
round(((a.order_number-c.order_number)/c.order_number)*100,2) AS order_number_comp_last_month,
round(((a.amount-b.amount)/b.amount)*100,2) AS amount_comp_last_year,
round(((a.order_number-b.order_number)/b.order_number)*100,2) AS order_number_comp_last_year
FROM (
SELECT
SalesTerritoryKey,
concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2)) AS orderdate,
year(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_YEAR,
QUARTER(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_QUARTER,
MONTH(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_MONTH,
WEEKofyear(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_WEEK,
count( SalesAmount ) AS order_number,
round(count(SalesAmount)*(0.9+rand ()*0.4),2) AS order_number_forcost,
round( sum( SalesAmount ), 2 ) AS amount,
round(sum(SalesAmount)*(0.9+rand ()*0.4),2) AS amount_forcost,
round(sum(SalesAmount)/count(SalesAmount),2) AS customerunitprice,
round( avg( TotalProductCost ), 2 ) AS per_productcost,
round( avg( TaxAmt ), 2 ) AS per_tax,
round( avg( freight ), 2 ) AS avg_freight
FROM
adventure_cj.FactInternetSales
GROUP BY
SalesTerritoryKey,
OrderDateKey
) a
LEFT JOIN (
SELECT
SalesTerritoryKey,
OrderDateKey,
date_add(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2)), 365 ) AS orderdate,
count( SalesAmount ) AS order_number,
round( sum( SalesAmount ), 2 ) AS amount
FROM
adventure_cj.FactInternetSales
GROUP BY
SalesTerritoryKey,
OrderDateKey
) b
ON a.SalesTerritoryKey = b.SalesTerritoryKey AND a.orderdate = b.orderdate
LEFT JOIN (
SELECT
SalesTerritoryKey,
OrderDateKey,
date_add(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2)),30) AS orderdate,
count( SalesAmount ) AS order_number,
round( sum( SalesAmount ), 2 ) AS amount
FROM
adventure_cj.FactInternetSales
GROUP BY
SalesTerritoryKey,
OrderDateKey
) c
ON a.SalesTerritoryKey = c.SalesTerritoryKey AND a.orderdate = c.orderdate
WHERE
a.orderdate <= current_date()
ORDER BY
a.SalesTerritoryKey,
a.orderdate;
(3)创建事实表 Factinternet,对比销售额、销售数量与目标的完成率分析
use adventure_dw_cj;
create table Factinternet
as
select a.*,round(a.order_number/a.order_number_forcost,2) as order_number_forcost_comp,
round(a.order_number/a.order_number_forcost,2) as amount_forcost_comp
from (
SELECT
a.OrderDatekey as orderdate,
year(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_YEAR,
QUARTER(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_QUARTER,
MONTH(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_MONTH,
WEEKofyear(concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2))) AS time_WEEK,
a.SalesTerritoryKey,
b.ProductSubcategoryKey,
count( a.CustomerKey ) AS order_number,
round(count(a.CustomerKey)* ( 0.9+rand ( ) * 0.4 ), 2 ) AS order_number_forcost,
round( sum( a.SalesAmount ), 2 ) AS Amount,
round(sum(SalesAmount)*(0.9+rand ()*0.4),2) AS amount_forcost,
round(sum(a.SalesAmount)/count(a.SalesAmount),2) AS customerunitprice,
round( avg( a.TotalProductCost ), 2 ) AS per_productcost,
round( avg( a.TaxAmt ), 2 ) AS per_tax,
round( avg( a.freight ), 2 ) AS avg_freight
FROM
adventure_cj.FactinternetSales a
LEFT JOIN adventure_ods_lan.DimProduct b ON a.ProductKey = b.ProductKey
where concat_ws('-',substr(orderdatekey,0,4),substr(orderdatekey,5,2),substr(orderdatekey,7,2)) <= current_date()
GROUP BY
a.OrderDatekey,
a.SalesTerritoryKey,
b.ProductSubcategoryKey) a;
7、每日定时更新数据
模拟基础数据更新,把每天新的销售数据入库到基础库中,并且更新汇总层的数据
0 1 * * * sh /root/adventure_cj/update_shell/update_data_everyday.sh #每天1点0分执行这个脚本
4 PowerBI展示数据
借助ODBC使用powerBI连接Hive数据库
目的是从以日、月、季、年为时间维度,销售区域来分析Adventure Works Cycles这家公司的销售额、销量、客单价、平均税费、平均运费、销售额完成率、平均成本、销售大区占比以及产品销量
展示连接如下:Adventure_Work