2018-06-26 code

# -*- coding: utf-8 -*-
"""
Created on Tue Jun 26 16:52:22 2018

@author: hao.wang
"""
import os
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import scipy.signal as signal
from sklearn.preprocessing import scale

cwd = os.getcwd()
# data_dir = 'D:\百分点工作资料\PHM技术资料\_871PHM_贝晓强\871PHM\871PHM\data_dictionary\data1.csv'

data_dir = 'data1.csv'
strPath = data_dir

ori_data = pd.read_csv(strPath)
ori_data = ori_data.sort_values(by=['code',  'time']) 

machine1 = ori_data[ori_data['code'].isin([663201])]
machine2 = ori_data[ori_data['code'].isin([663202])]
machine3 = ori_data[ori_data['code'].isin([663203])]

machine1.to_csv('machine1.csv')
machine2.to_csv('machine2.csv')
machine3.to_csv('machine3.csv')

machine1 = pd.read_csv('machine1.csv')
machine2 = pd.read_csv('machine2.csv')
machine3 = pd.read_csv('machine3.csv')


# delete the non-numerical columns
tmp = machine1.dtypes == 'object' # store if the type is not numerical
for i in range(0, len(tmp)-1):
    if tmp[i] == True:
        del machine1[tmp.index[i]]
        del machine2[tmp.index[i]]
        del machine3[tmp.index[i]]


# def separate(df, n):
#     to_cluster_data = df.loc[:, ('frequency', 'GNOM', 'MP02', 'MP03', 'MP05', 'MP10', 'MP11')]
#     scaled = scale(to_cluster_data, axis=0)
#     scaled[:, 0] = scaled[:, 0]*6
#     estimator = KMeans(n_clusters=n)
#     estimator.fit(scaled)
#     label_pred = estimator.labels_
#     # centroids = estimator.cluster_centers_
#     # inertia = estimator.inertia_
#     return label_pred


# see the frequency of different machines
machine1['frequency'].value_counts()
machine2['frequency'].value_counts()
machine3['frequency'].value_counts()

ori_data['frequency'].value_counts()
machine1['preset_power'].value_counts()


def choose_frequency(machine):
    tmp_count = machine['frequency'].value_counts()
    tmp_count > len(machine)*0.02
    # chooose the frequency that is larger than 0.02
    choose_freq = tmp_count.index[tmp_count > len(machine)*0.02]
    return choose_freq


choose_frequency(machine1)
choose_frequency(machine2)
choose_frequency(machine3)


# use boxplot to show the data
def save_seriesplot(df, name):
    # close the in screen print of the figure
    plt.ioff()
    # define a figure
    f, axs = plt.subplots(27, 2, figsize=(10*2, 27*2))
    axs = axs.ravel() # use ravel to make the tuple flat
    for i in range(0, len(df.iloc[0,:])-1):
        axs[i].plot(df.iloc[:, i].reset_index(drop=True))
        axs[i].set_title(df.columns[i])
    f.savefig(str(name)+'_seriesplot.png')
    # open the on screen print of the figure
    plt.ion()


# define a function to separate diffrent frequency on specific machine
def choose_and_plot(machine, name):
    freq = choose_frequency(machine)
    for item in freq:
        print(item)
        machine_freq = machine[machine['frequency'] == item]
        machine_freq = machine_freq[machine_freq['op_st'] == 112]
        machine_freq.to_csv('freq_' + str(item) + '_'+ name + '.csv')
        save_seriesplot(machine_freq, 'freq_' + str(item) + '_' + name)


choose_and_plot(machine1, 'machine1')
choose_and_plot(machine2, 'machine2')
choose_and_plot(machine3, 'machine3')



len(machine1)


machine_freq['']




# define a function to find the average stable length of a series


# def mean_normal(series):
#     tmp_length = 1  # store the temp normal series length
#     shift_times = 0 # store the total shift times
#     length = 0
#     search_range = len(series)
#     for i in range(1, search_range):
#         if series[i]==series[i-1]:
#             tmp_length += 1
#         else:
#             shift_times += 1
#             length = length/shift_times*(shift_times-1) + tmp_length/shift_times*1
#             # print(tmp_length)
#             # print(length)
#             tmp_length = 1
#     # do again for the last patch since there is not shift
#     shift_times += 1
#     return length/shift_times*(shift_times-1) + tmp_length/shift_times*1
#
#
# mean_normal(label_pred)
#
#
# # define my function to smooth abnormal shift
# def smooth_abnormal(series):
#     length = mean_normal(series)
#     radius = int(length/2)
#     series_new = series
#     # then we try to smooth the abnormal shifts:
#     for i in range(0, len(series)):
#         if series[i] != series[i-radius] and series[i] != series[i+radius]:
#             series_new[i] = series[i-radius]
#     return series_new
#
#
# # label_smooth = smooth_abnormal(label_pred)
# label_smooth = signal.medfilt(label_pred, 999)
#
# f, axs = plt.subplots(4, 1, figsize=(10, 10))
# axs = axs.ravel()  # use ravel to make the tuple flat
# axs[0].plot(label_pred)
# axs[0].set_title('label_pred')
# axs[1].plot(label_smooth)
# axs[1].set_title('label_smooth')
# axs[2].plot(machine1['frequency'])
# axs[2].set_title('frequency')
#
#
# machine1['label'] = label_smooth
# machine1['label'] = label_pred

len(machine1) + len(machine2) + len(machine3) == len(ori_data)


del ori_data


        

machine1.columns


# separate the machine data into different frequency

# machine1_frequency_1 = machine1[machine1['label'].isin(['0'])]
# machine1_frequency_2 = machine1[machine1['label'].isin(['1'])]
# machine1_frequency_3 = machine1[machine1['label'].isin(['2'])]
#
# machine1_frequency_1 = machine1_frequency_1[machine1_frequency_1['op_st'].isin([112])]
# machine1_frequency_2 = machine1_frequency_2[machine1_frequency_2['op_st'].isin([112])]
# machine1_frequency_3 = machine1_frequency_3[machine1_frequency_3['op_st'].isin([112])]


# make some summarize about the machines data
def save_quant(df, name):
    # make a copy of original stdout route
    stdout_backup = sys.stdout
    # define the log file that receives your log info
    log_file = open(str(name) + "_message.log", "w")
    # redirect print output to log file
    sys.stdout = log_file
    # begin print 
    for item in df.columns:
        print(df[item].describe())
    # close the file
    log_file.close()
    # restore the output to initial pattern
    sys.stdout = stdout_backup


save_quant(machine1, 'machine1')
save_quant(machine2, 'machine2')
save_quant(machine3, 'machine3')


# use boxplot to show the data   
def save_boxplot(df, name):
    # close the in screen print of the figure 
    plt.ioff() 
    # define a figure
    f, axs = plt.subplots(6, 9, figsize=(6*6 , 9*6))
    axs = axs.ravel() # use ravel to make the tuple flat
    for i in range(0, len(df.iloc[0, :])-1):
        axs[i].boxplot(df.iloc[:, i])
        axs[i].set_title(df.columns[i])
    f.savefig(str(name)+'_boxplot.png')
    # open the on screen print of the figure
    plt.ion()
    

save_boxplot(machine1, 'machine1')
save_boxplot(machine2, 'machine2')
save_boxplot(machine3, 'machine3')
    




save_seriesplot(machine1_frequency_1, 'machine1_frequency_1')
save_seriesplot(machine1_frequency_2, 'machine1_frequency_2')
save_seriesplot(machine1_frequency_3, 'machine1_frequency_3')

save_seriesplot(machine2, 'machine2')
save_seriesplot(machine3, 'machine3')




import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt


wd = os.getcwd()

data_dir = '/data_20180627/my_871uptransmitterminutedata2018-06-27.csv'
# obtain the original data
ori_data = pd.read_csv(wd + data_dir, header=None, sep='\t')
# split the string to measurement data
split_data = ori_data.iloc[:, 8].str.split(':', expand=True)
# 删除第一列，因为是空列
del split_data[0]
# give name to those columns
# 给各列命名
split_data.columns = ['mode', 'frequency', 'preset_power', 'mdlt_md', 'op_st',
                    'laV1', 'lg1V2', 'lg2V2', 'laV2', 'VaV2',
                    'APD', 'VSWR', 'Pfwd', 'Vfil1', 'VaV1',
                    'Vfil2', 'Vg1V2', 'Vg2V2', 'water', 'MP01',
                    'MP13', 'MP02', 'MP03', 'MP04', 'MP05',
                    'MP06', 'MP07', 'MP08', 'MP09', 'MP10',
                    'MP11', 'MP12', 'MS22', 'MS23', 'MS24',
                    'MS25', 'MS27','CPFD', 'CPRV', 'CPH1',
                    'CPH2', 'RFAT', 'GNOM', 'PREEQ', 'TAF',
                    'TOL', 'TAO', 'TNOM', 'TP', 'TTCS',
                    'TIS', 'TSF', 'TFS', 'TFH', 'TBHH',
                    'switch_comb']
len(split_data.columns)

# for some '57' like entries, can be transferred to numeric by 'to_numeric' function
# 部分列可以直接转换为numeric模式
for item in ['mode', 'frequency', 'preset_power', 'mdlt_md', 'op_st',
                    'laV1', 'lg1V2', 'lg2V2', 'laV2', 'VaV2',
                    'APD', 'VSWR', 'Pfwd', 'Vfil1', 'VaV1',
                    'Vfil2', 'Vg1V2', 'Vg2V2', 'water', 'MP01',
                    'MP13', 'MP02', 'MP03', 'MP04', 'MP05',
                    'MP06', 'MP07', 'MP08', 'MP09', 'MP10',
                    'MP11', 'MP12',
                                    'CPFD', 'CPRV', 'CPH1',
                    'CPH2', 'RFAT', 'GNOM',        'TAF',
                    'TOL', 'TAO', 'TNOM', 'TP', 'TTCS',
                    'TIS', 'TSF', 'TFS', 'TFH', 'TBHH',
                    'switch_comb']:
    split_data[item] = pd.to_numeric(split_data[item], errors='coerce').fillna(0)


# convert ON OFF to 0/1 variables
for item in ['MS22', 'MS23', 'MS24', 'MS25', 'MS27', 'PREEQ']:
    split_data[item] = (split_data[item] == 'ON').astype(int)
split_data['index'] = split_data.index


other_data = pd.DataFrame({'code': ori_data.ix[:, 1], 'time': ori_data.ix[:, 2]})
other_data['index'] = other_data.index

# merge to obtain a new dataframe
new_data = pd.merge(other_data, split_data, on='index')
new_data.columns
new_data.dtypes
new_data = new_data.sort_values(by='time')
new_data.to_csv('new_data.csv')

new_data = pd.read_csv('new_data.csv')
# change the time string to 'datetime' format
new_data['time'] = pd.to_datetime(new_data['time'])
new_data = new_data.set_index('time', drop= False)
# separate the newdata based on time index
new_20170725 = new_data['2017-07-24':'2017-07-26']
new_20170805 = new_data['2017-08-04':'2017-08-06']
new_20170815 = new_data['2017-08-14':'2017-08-17']

# check the total length
len(new_20170725) + len(new_20170805) + len(new_20170815)
len(new_data)

# 找到高压挂起的时间点
# Find the high V pending time using TTCS
new_20170725['TTCS'].value_counts()
new_20170805['TTCS'].value_counts()
new_20170815['TTCS'].value_counts()

new_20170815[new_20170815['TTCS'] == 1]

# 找到粗调的时间点
# 1:normal 2:coarse 3:fine 4: VaRising
new_20170725['TFS'].value_counts()
new_20170805['TFS'].value_counts()
new_20170815['TFS'].value_counts()

tmp = new_20170725[new_20170725['TFS'] == 2.0]
new_20170725[new_20170725['TFS'] == 3.0]
new_20170725[new_20170725['TFS'] == 4.0]



# 画图
# use boxplot to show the data
def save_seriesplot(df, name):
    # close the in screen print of the figure
    plt.ioff()
    # define a figure
    f, axs = plt.subplots(56, 1, figsize=(80, 80))
    axs = axs.ravel() # use ravel to make the tuple flat
    for i in range(0, 56):
        axs[i].plot(df.iloc[:, i + 3], linewidth=0.5)
        axs[i].set_title(df.columns[i + 3])
    plt.subplots_adjust(hspace=1)
    f.savefig(str(name)+'_seriesplot.png')
    # open the on screen print of the figure
    plt.ion()


# use boxplot to show the data
def save_seriesplot_noindex(df, name):
    # close the in screen print of the figure
    plt.ioff()
    # define a figure
    f, axs = plt.subplots(56, 1, figsize=(80, 80))
    axs = axs.ravel() # use ravel to make the tuple flat
    if drop == False:
        for i in range(0, 56):
            axs[i].plot(df.iloc[:, i + 3], linewidth=0.5)
            axs[i].set_title(df.columns[i + 3])
    else:
        for i in range(0, 56):
            axs[i].plot(df.iloc[:, i + 3].reset_index(drop=True), linewidth=0.5)
            axs[i].set_title(df.columns[i + 3])
    plt.subplots_adjust(hspace=1)
    f.savefig(str(name)+'_seriesplot.png')
    # open the on screen print of the figure
    plt.ion()


new_20170725.iloc[:, 0+3].reset_index(drop=False)
f = plt
f.plot(new_20170725.iloc[:, 0+3].reset_index(drop=False))

save_seriesplot(new_20170725, 'new_20170725')
save_seriesplot(new_20170805, 'new_20170805')
save_seriesplot(new_20170815, 'new_20170815')




# remove all zero data as possible
# get the on working data
new_20170725_nozero = new_20170725[(new_20170725['laV1'] > 0.5) &
                                   (new_20170725['op_st'] == 112)&
                                   (new_20170725['laV2'] > 0.5)]

new_20170805_nozero = new_20170805[(new_20170805['laV1'] > 0.5) &
                                   (new_20170805['op_st'] == 112)&
                                   (new_20170805['laV2'] > 0.5)]

new_20170815_nozero = new_20170815[(new_20170815['laV1'] > 0.5) &
                                   (new_20170815['op_st'] == 112)&
                                   (new_20178025['laV2'] > 0.5)]


save_seriesplot(new_20170725_nozero, 'new_20170725_nozero', True)
save_seriesplot(new_20170805_nozero, 'new_20170805_nozero', True)
save_seriesplot(new_20170815_nozero, 'new_20170815_nozero', True)




# all data sum up for frequency 
import pandas as pd
import matplotlib.pyplot as plt

# read the data
new_data = pd.read_csv('new_data.csv')
machine1 = pd.read_csv('machine1.csv')
machine2 = pd.read_csv('machine2.csv')
machine3 = pd.read_csv('machine3.csv')

# remove some columns
del new_data['Unnamed: 0']
del new_data['index']
del machine1['Unnamed: 0'], machine1['TAH'], machine1['index']
del machine2['Unnamed: 0'], machine2['TAH'], machine2['index']
del machine3['Unnamed: 0'], machine3['TAH'], machine3['index']

# check
len(new_data.ix[0, :])
len(machine1.ix[0, :])
len(machine2.ix[0, :])
len(machine3.ix[0, :])

# merge, 'axis=0' means join by rows
all_data = pd.concat([new_data, machine1, machine2, machine3], axis=0, join='outer')
# check
len(all_data) == len(machine1) + len(machine2) +len(machine3) +len(new_data)
# see the counts of different frequency
tmp_counts = all_data['frequency'].value_counts()
tmp_counts

# divide by frequency
# based on THAMES Transmitter Technique Specifications
interval = [[5900, 6295],   [7100, 7600],   [9400, 9900],
            [11500, 12175], [13570, 13870], [15030, 15800],
            [17480, 17900], [18900, 19020], [21450, 21850],
            [25670, 26100]]

# define some frequency series
freq = [None]*10
# sort the data based on the frequency interval
for i in range(0, 10):
    freq[i] = all_data[(all_data['frequency'] >= interval[i][0]) &
                       (all_data['frequency'] <= interval[i][1])]

# see the length of each frequency
sum = 0
for i in range(0, 10):
    print(len(freq[i]))
    sum += len(freq[i])

len(all_data)-sum
tmp = all_data[(all_data['frequency'] == 9370)]
len(tmp)


del f
2018-06-26 code

推荐阅读更多精彩内容