Tryag File Manager
Home
||
Turbo Force
||
B-F Config_Cpanel
Current Path :
/
paip
/
script
/
weight
/
Or
Select Your Path :
Upload File :
New :
File
Dir
//paip/script/weight/PredictWeightTrend.py
'''
@date    : 2022-03-25
@author  : 전규빈
@content : Generate the predicted weight trend for each poultry house
           (무게 트렌드 생성): load raw image-pixel data and measured weights
           from the DB, remove outliers, fit an XGBoost pixel->weight model,
           smooth the result and (optionally) upload it back to the DB.
'''
import os.path
import sys
import time
import pandas as pd
import numpy as np
import glob, warnings
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
import joblib
import xgboost
import pickle
from sklearn.pipeline import Pipeline
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.base import JobLookupError

HOME_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
DATA_DIR = 'data'
MODULE_DIR = 'util'
OUTPUT_DIR = 'out'
sys.path.append(os.path.join(HOME_PATH, MODULE_DIR))
from PyDBconnector import PyDBconnector

warnings.filterwarnings('ignore')


class WeightPredict():
    # NOTE(review): this hand-rolled singleton pair is unused by the __main__
    # driver below (it instantiates the class directly), and the classmethod
    # named `instance` shadows the class attribute of the same name.  Kept
    # unchanged for backward compatibility with any external callers.
    instance = None

    @classmethod
    def _getInstance(cls):
        return cls._instance

    @classmethod
    def instance(cls, *args, **kargs):
        cls._instance = cls(*args, **kargs)
        cls.instance = cls._getInstance
        return cls._instance

    def __init__(self):
        # Opens the DB connection and immediately runs the full pipeline.
        self._dbConn = PyDBconnector('192.100.0.11')
        self.load_data()

    def change_raw_pixel(self, dataframe):
        '''
        Explode the 'WEIGHT_PREDICTION_PIXEL_MEAN' string column (e.g.
        "[12, 34, ...]") into one row per pixel value, stored in a new
        'pixel' column; all other columns are repeated accordingly.

        :param dataframe: frame containing 'WEIGHT_PREDICTION_PIXEL_MEAN'
        :return: exploded raw-pixel frame
        '''
        def remove_str(value):
            # "[1, 2]" -> [1, 2]; an empty string yields an empty list
            # (matching the original guard `if x != ""`).
            value = value.replace('[', "").replace(']', "")
            if value == "":
                return []
            return [int(token) for token in value.split(",")]

        def using_repeat(df, column_name):
            other_cols = [c for c in df.columns if c != column_name]
            lens = [len(item) for item in df[column_name]]
            print(other_cols)
            # Generalized over all remaining columns; the original hard-coded
            # indices [0]..[5] and broke for any other column count.
            data = {col: np.repeat(df[col].values, lens) for col in other_cols}
            data[column_name] = np.concatenate(df[column_name].values)
            return pd.DataFrame(data)

        dataframe['pixel'] = dataframe['WEIGHT_PREDICTION_PIXEL_MEAN'].apply(remove_str)
        result_df_raw = using_repeat(dataframe, 'pixel')
        return result_df_raw

    # DBSCAN (alternative outlier detector; currently unused in outlier_remove)
    def dbscan(self, dataframe):
        '''
        Label each row's 'pixel' value with a DBSCAN cluster id (-1 = noise).

        :param dataframe: one day's worth of raw pixel rows
        :return: same frame with a 'cluster' column added
        '''
        scaler = StandardScaler()
        df_scale = pd.DataFrame(scaler.fit_transform(pd.DataFrame(dataframe['pixel'])))
        # fit_predict both fits and labels in one pass; the original fit the
        # same model three times (fit in the constructor chain, fit again,
        # then fit_predict).
        model = DBSCAN(eps=1.2, min_samples=100)
        dataframe['cluster'] = model.fit_predict(df_scale)
        return dataframe

    def tukey_outlier(self, dataframe):
        '''
        Flag rows whose 'pixel' value lies inside a Tukey-style fence;
        the 'cluster' column is True for inliers, False for outliers.

        :param dataframe: one day's worth of raw pixel rows
        :return: same frame with a boolean 'cluster' column added
        '''
        Q1 = np.percentile(dataframe['pixel'], 35)
        Q3 = np.percentile(dataframe['pixel'], 95)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        # NOTE(review): classic Tukey uses Q3 + 1.5*IQR for the upper fence;
        # the Q1-based upper bound (and the 35/95 percentiles) look like
        # deliberate tuning for this data, so they are preserved — confirm
        # with the author.
        upper = Q1 + 1.5 * IQR
        condition = (dataframe['pixel'] >= lower) & (dataframe['pixel'] <= upper)
        dataframe['cluster'] = condition
        return dataframe

    # Outlier detection and removal (tukey_outlier applied per day)
    def outlier_remove(self, dataframe, plot=False):
        '''
        Split the raw pixel frame into daily windows, flag outliers per day,
        drop them, and compute a 6-hourly median pixel series.

        :param dataframe: exploded raw pixel frame (default RangeIndex expected)
        :param plot: when True, scatter-plot normal vs outlier points
        :return: (outlier-free raw frame, 6h median 'pixel' frame)
        '''
        result_df = pd.DataFrame()
        start_date = dataframe['CREATE_TIME'].iloc[0].strftime('%Y-%m-%d')
        end_date = dataframe['CREATE_TIME'].iloc[-1].strftime('%Y-%m-%d')
        date_range = pd.date_range(start=start_date, end=end_date)

        # Flag outliers one calendar day at a time.
        for i in range(len(date_range)):
            if i == len(date_range) - 1:
                condition = (dataframe['CREATE_TIME'] >= date_range[i])
            else:
                # Half-open window [day, next day).  The original used an
                # inclusive upper bound, which duplicated any row falling
                # exactly on a midnight boundary into two windows.
                condition = ((dataframe['CREATE_TIME'] >= date_range[i])
                             & (dataframe['CREATE_TIME'] < date_range[i + 1]))
            # result = self.dbscan(dataframe[condition])
            result = self.tukey_outlier(dataframe[condition])
            result_df = pd.concat((result_df, result), sort=False)

        condition = (result_df["cluster"] == True)
        if plot:
            plt.figure(figsize=(20, 10))
            plt.title(f' Raw Pixel Outlier ')
            plt.scatter(x=result_df[condition]['CREATE_TIME'], y=result_df[condition]['pixel'],
                        color='blue', label='normal')
            plt.scatter(x=result_df[~(condition)]['CREATE_TIME'], y=result_df[~(condition)]['pixel'],
                        color='red', label='outlier')
            plt.legend(loc='best', ncol=3)
            plt.show()

        outlier_remove_raw = result_df[condition]
        # 6-hourly median of the surviving pixel values.
        medianPixel = outlier_remove_raw.groupby(pd.Grouper(key='CREATE_TIME', freq='6h')).median()['pixel']
        medianPixel = medianPixel.reset_index()
        return outlier_remove_raw, medianPixel

    def model_train(self, dataframe_pixel, train_data):
        '''
        Fit an XGBoost regressor mapping (quadratic-polynomial) pixel features
        to the measured median weight, predict a weight for every raw pixel
        row, then smooth the 6h-grouped prediction with a quadratic
        least-squares regression over the bucket index.

        :param dataframe_pixel: outlier-free raw pixel frame ('CREATE_TIME', 'pixel')
        :param train_data: merged frame with 'pixel' and 'medianWeight'
        :return: 6h-grouped frame with 'PredictWeight' and 'linear predict'
        '''
        x_train = pd.DataFrame(train_data['pixel'])
        y_train = train_data['medianWeight']
        # FIX: the original passed the misspelled kwargs 'mild_child_weight'
        # and 'labmda', which are not recognized XGBoost parameters, so the
        # intended values never took effect; corrected to
        # min_child_weight / reg_lambda.
        xgb_model = xgboost.XGBRegressor(booster='gblinear', n_estimators=1500, learning_rate=0.5,
                                         gamma=0, subsample=1, eta=1, min_child_weight=0.3,
                                         colsample_bytree=1, max_depth=0, reg_lambda=0)

        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        A_poly_train = poly_features.fit_transform(x_train)
        xgb_model.fit(A_poly_train, y_train)

        # Predict a weight for every raw pixel sample (transform is
        # equivalent to the original fit_transform here, without refitting).
        A_poly_test = poly_features.transform(pd.DataFrame(dataframe_pixel['pixel']).astype('int64'))
        dataframe_pixel['PredictWeight'] = xgb_model.predict(A_poly_test)

        # Group per-sample predictions into 6h buckets and interpolate gaps.
        dataframe_pixel_grouped = dataframe_pixel.groupby(pd.Grouper(key='CREATE_TIME', freq='6h')).mean()
        dataframe_pixel_grouped = dataframe_pixel_grouped.interpolate()
        dataframe_pixel_grouped_index = dataframe_pixel_grouped.reset_index()

        # Regression-to-the-mean smoothing: quadratic fit over bucket index.
        x_train = pd.DataFrame(dataframe_pixel_grouped_index.index)
        y_train = dataframe_pixel_grouped['PredictWeight']
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        A_poly_train = poly_features.fit_transform(x_train)
        lin_reg = LinearRegression()
        lin_reg.fit(A_poly_train, y_train)
        # Predicting on the training design matrix itself (same index range).
        dataframe_pixel_grouped['linear predict'] = lin_reg.predict(A_poly_train)
        return dataframe_pixel_grouped

    def load_data(self):
        '''
        Main pipeline.  For every house with an active (or date-matching)
        breeding cycle: load raw pixel and measured weight data, remove
        outliers, train the weight model, attach day-age/cycle metadata and
        print the resulting trend (DB upload currently disabled).

        :raises Exception: when tbl_house_breed_hist has no applicable cycle
        '''
        total_start_time = time.time()

        # Reference "now" used to pick the breeding cycle to update.
        update_date = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
        update_date = pd.to_datetime(update_date)
        # update_date = pd.to_datetime('2021-12-25')  # manual override for backfills

        # Day-age (breeding history) data.
        dayAge_sql_str = f"select HOUSE_ID, IN_DATE, OUT_DATE, DAYS_AFTER_BIRTH from tbl_house_breed_hist ORDER BY IN_DATE , HOUSE_ID"
        house_breed_hist_total = self._dbConn.select_from_db(dayAge_sql_str)

        # Closed cycles that contain the update date.
        house_breed_hist = house_breed_hist_total[
            (update_date >= house_breed_hist_total['IN_DATE'])
            & (update_date <= house_breed_hist_total['OUT_DATE'])]

        no_out_date = False
        if len(house_breed_hist) == 0:
            # No closed cycle covers today: fall back to open cycles
            # (IN_DATE set, OUT_DATE still null).
            house_breed_hist = house_breed_hist_total[house_breed_hist_total['OUT_DATE'].isnull()]
            if house_breed_hist.iloc[0]['IN_DATE'] <= update_date:
                no_out_date = True
            else:
                # No applicable cycle at all.
                raise Exception("Not found tbl_house_breed_hist data")
        print(house_breed_hist)

        house_id_list = sorted(set(house_breed_hist['HOUSE_ID']))
        for house_idx in house_id_list:
            start_time = time.time()
            print(f"-------------------------------{house_idx} -------------------------------")
            in_date = house_breed_hist[house_breed_hist['HOUSE_ID'] == house_idx]['IN_DATE'].iloc[0]
            # NOTE(review): SQL is built by f-string interpolation.  The values
            # come from our own DB, but parameterized queries would be safer
            # if PyDBconnector supports them.
            if no_out_date:
                # Open cycle: take everything from the stocking date onward.
                print(in_date)
                pixel_sql_str = f"select CREATE_TIME, HOUSE_ID, MODULE_ID, WEIGHT_PREDICTION_PIXEL_MEAN from tbl_image_analysis_weight where MODULE_ID LIKE 'CT%' and HOUSE_ID = '{house_idx}' and CREATE_TIME >= '{in_date}'"
                weight_sql_str = f"select CREATE_TIME, medianWeight, medianPixel, HOUSE_ID from tbl_weight_stats where HOUSE_ID = '{house_idx}'and CREATE_TIME >= '{in_date}'"
            else:
                # Closed cycle: restrict to [stocking date, shipping date].
                out_date = house_breed_hist[house_breed_hist['HOUSE_ID'] == house_idx]['OUT_DATE'].iloc[0]
                pixel_sql_str = f"select CREATE_TIME, HOUSE_ID, MODULE_ID, WEIGHT_PREDICTION_PIXEL_MEAN from tbl_image_analysis_weight where HOUSE_ID = '{house_idx}' and CREATE_TIME between '{in_date}' and '{out_date}'"
                weight_sql_str = f"select CREATE_TIME, medianWeight, medianPixel, HOUSE_ID from tbl_weight_stats where HOUSE_ID = '{house_idx}'and CREATE_TIME between '{in_date}' and '{out_date}'"

            pixel_db_data = self._dbConn.select_from_db(pixel_sql_str)
            pixel_db_data = pixel_db_data.sort_values(by=["CREATE_TIME"], ascending=[True]).reset_index(drop=True)
            pixel_db_data = pixel_db_data[pixel_db_data['WEIGHT_PREDICTION_PIXEL_MEAN'].notnull()]
            weight_db_data = self._dbConn.select_from_db(weight_sql_str)

            if len(pixel_db_data) == 0:
                print(f"{house_idx} No Pixel data")
                continue
            if len(weight_db_data) == 0:
                print(f"{house_idx} No Weight data")
                continue

            farm_pixel = pixel_db_data[pixel_db_data['HOUSE_ID'] == house_idx].reset_index(drop=True)
            farm_weight = weight_db_data[weight_db_data['HOUSE_ID'] == house_idx].reset_index(drop=True)
            farm_weight['CREATE_TIME'] = pd.to_datetime(farm_weight['CREATE_TIME'], format='%Y-%m-%d %H:%M:%S')

            house_cycle = house_breed_hist_total[house_breed_hist_total['HOUSE_ID'] == house_idx].reset_index(drop=True)
            date = farm_pixel['CREATE_TIME'].iloc[0]

            # Cycle number = 1-based position of the matching breeding record.
            if no_out_date:
                # FIX: the open (current) cycle is the row whose OUT_DATE is
                # null.  The original indexed with a full-frame isnull() mask,
                # which always resolved to row 0 regardless of the data.
                cycle = house_cycle[house_cycle['OUT_DATE'].isnull()].index[0] + 1
            else:
                cycle = house_cycle[(date >= house_cycle['IN_DATE']) & (date <= house_cycle['OUT_DATE'])].index[0] + 1

            self.start_date = farm_pixel['CREATE_TIME'].iloc[0].strftime('%Y-%m-%d')
            self.end_date = farm_pixel['CREATE_TIME'].iloc[-1].strftime('%Y-%m-%d')

            # Flock stocking datetime used to derive fractional day-age.
            chicken_age_data = house_breed_hist[(date >= house_breed_hist['IN_DATE'])]
            chicken_age = chicken_age_data.iloc[0]['IN_DATE'].strftime('%Y-%m-%d %H:%M:%S')
            chicken_age = datetime.strptime(chicken_age, '%Y-%m-%d %H:%M:%S')

            raw_result = self.change_raw_pixel(farm_pixel)
            outlier_remove_raw, medianPixel = self.outlier_remove(raw_result)

            # Training set: 6h median pixel joined to measured median weight.
            train_data = pd.merge(left=medianPixel, right=farm_weight, how='inner', on='CREATE_TIME')
            print("----------------Train data----------------")
            print(train_data)
            print("------------------------------------------")

            result = self.model_train(outlier_remove_raw, train_data)
            result['HOUSE_ID'] = house_idx
            result['cycle'] = cycle
            result = result.reset_index()

            # Day-age = fractional days since stocking + days-after-birth offset.
            # NOTE(review): DAYS_AFTER_BIRTH here is a pandas Series (as in the
            # original); it presumably holds a single row per house — confirm,
            # otherwise `.iloc[0]` is probably intended.
            days_after_birth = house_breed_hist[house_breed_hist['HOUSE_ID'] == house_idx]['DAYS_AFTER_BIRTH']
            result.loc[result['HOUSE_ID'] == house_idx, 'dayAge'] = [
                round((datetime.strptime(x.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S') - chicken_age).days
                      + ((datetime.strptime(x.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S') - chicken_age).seconds / 3600) / 24,
                      2) + days_after_birth
                for x in result[result['HOUSE_ID'] == house_idx]['CREATE_TIME']]

            print(result[['CREATE_TIME', 'HOUSE_ID', 'linear predict', 'dayAge', 'cycle']])
            # self.upload_db(result)
            print("time :", time.time() - start_time)
            print("------------------------------------------------------")

        self.close_db()
        print("finish :", time.time() - total_start_time)

    def upload_db(self, result):
        '''
        Upsert the weight-trend rows into tbl_weight_trend; failures are
        logged per row and do not stop the loop.

        :param result: frame with CREATE_TIME, HOUSE_ID, 'linear predict',
                       dayAge and cycle columns (default RangeIndex)
        '''
        for i in range(len(result)):
            try:
                row = result.loc[i]
                # NOTE(review): values are interpolated into the SQL string;
                # they originate from our own model/DB, but a parameterized
                # insert would be safer if insert_to_db supports it.
                insert_str = (
                    f"insert into tbl_weight_trend (CREATE_TIME, HOUSE_ID, WEIGHT, DAY_AGE, CYCLE) "
                    f"values('{row['CREATE_TIME']}', '{row['HOUSE_ID']}', {row['linear predict']}, "
                    f"{row['dayAge']}, {row['cycle']}) "
                    f"ON DUPLICATE KEY UPDATE CREATE_TIME='{row['CREATE_TIME']}', "
                    f"HOUSE_ID = '{row['HOUSE_ID']}', WEIGHT = {row['linear predict']}, "
                    f"DAY_AGE={row['dayAge']}, CYCLE={row['cycle']}")
                self._dbConn.insert_to_db(insert_str)
                print("DB upload Success")
            except Exception as e:
                print(e)

    def close_db(self):
        # Release the DB connection held since __init__.
        self._dbConn.close()


if __name__ == '__main__':
    '''
    Driver: run one prediction pass immediately.  The commented-out
    scheduler would instead refresh the trend every six hours.
    '''
    def job():
        print("Update Predict Weight every 6 hours")
        WeightPredict()

    # sched = BackgroundScheduler()
    # sched.add_job(job, 'cron', hour='00, 06, 12, 18', id="httpFileTransferToday_1")
    # sched.add_job(job, 'cron', minute='*', second='*/10', id="httpFileTransferToday_1")
    # sched.start()
    job()
    # Keep the process alive (needed when the background scheduler is enabled).
    while True:
        time.sleep(5)