Code_Predicting Car Fuel Efficiency_Keras Edition

Keras Code

This code builds a model for the DNN summarized earlier and uses it to make predictions.

The data concerns car fuel efficiency (MPG); we build a model to predict it and compare the model's predictions against the actual values.

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)
2.3.0

Dataset

  • A model that predicts the fuel efficiency of cars from the late 1970s and early 1980s
  • Download the data and specify the column names
dataset_path = keras.utils.get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path
'C:\\Users\\uos\\.keras\\datasets\\auto-mpg.data'
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']
raw_dataset = pd.read_csv(dataset_path, names=column_names,
                      na_values = "?", comment='\t',
                      sep=" ", skipinitialspace=True)

dataset = raw_dataset.copy()
dataset.tail()
      MPG  Cylinders  Displacement  Horsepower  Weight  Acceleration  Model Year  Origin
393  27.0          4         140.0        86.0  2790.0          15.6          82       1
394  44.0          4          97.0        52.0  2130.0          24.6          82       2
395  32.0          4         135.0        84.0  2295.0          11.6          82       1
396  28.0          4         120.0        79.0  2625.0          18.6          82       1
397  31.0          4         119.0        82.0  2720.0          19.4          82       1

Data Cleaning

dataset.isna().sum()
MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64
# rows with missing values
dataset[dataset['Horsepower'].isnull()]

      MPG  Cylinders  Displacement  Horsepower  Weight  Acceleration  Model Year  Origin
32   25.0          4          98.0         NaN  2046.0          19.0          71       1
126  21.0          6         200.0         NaN  2875.0          17.0          74       1
330  40.9          4          85.0         NaN  1835.0          17.3          80       2
336  23.6          4         140.0         NaN  2905.0          14.3          80       1
354  34.5          4         100.0         NaN  2320.0          15.8          81       2
374  23.0          4         151.0         NaN  3035.0          20.5          82       1
# replace missing values with the Horsepower mean
dataset = dataset.fillna(dataset['Horsepower'].mean())
dataset[330:337]
      MPG  Cylinders  Displacement  Horsepower  Weight  Acceleration  Model Year  Origin
330  40.9          4          85.0  104.469388  1835.0          17.3          80       2
331  33.8          4          97.0   67.000000  2145.0          18.0          80       3
332  29.8          4          89.0   62.000000  1845.0          15.3          80       2
333  32.7          6         168.0  132.000000  2910.0          11.4          80       3
334  23.7          3          70.0  100.000000  2420.0          12.5          80       3
335  35.0          4         122.0   88.000000  2500.0          15.1          80       2
336  23.6          4         140.0  104.469388  2905.0          14.3          80       1
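
Note that dataset.fillna(dataset['Horsepower'].mean()) applies the Horsepower mean to NaNs in every column. Only Horsepower has missing values here, so the result is the same, but a column-targeted version (a minimal sketch, not what the code above runs) makes the intent explicit:

# Hypothetical alternative: fill only the Horsepower column,
# leaving any future NaNs in other columns untouched.
dataset['Horsepower'] = dataset['Horsepower'].fillna(dataset['Horsepower'].mean())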

One-Hot Encoding

  • Origin (1, 2, 3) is a categorical feature
dataset.groupby('Origin').count()
        MPG  Cylinders  Displacement  Horsepower  Weight  Acceleration  Model Year
Origin
1       249        249           249         249     249           249         249
2        70         70            70          70      70            70          70
3        79         79            79          79      79            79          79
origin = dataset.pop('Origin')  # pop removes the column and returns it
dataset['USA'] = (origin == 1)*1.0
dataset['Europe'] = (origin == 2)*1.0
dataset['Japan'] = (origin == 3)*1.0
dataset.tail()
      MPG  Cylinders  Displacement  Horsepower  Weight  Acceleration  Model Year  USA  Europe  Japan
393  27.0          4         140.0        86.0  2790.0          15.6          82  1.0     0.0    0.0
394  44.0          4          97.0        52.0  2130.0          24.6          82  0.0     1.0    0.0
395  32.0          4         135.0        84.0  2295.0          11.6          82  1.0     0.0    0.0
396  28.0          4         120.0        79.0  2625.0          18.6          82  1.0     0.0    0.0
397  31.0          4         119.0        82.0  2720.0          19.4          82  1.0     0.0    0.0
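
The same encoding can be produced with pd.get_dummies. This is a hypothetical equivalent sketch, not what the code above runs; it assumes Origin has not yet been popped from the DataFrame:

# Map the numeric codes to region names, then one-hot encode.
origin = dataset.pop('Origin').map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = dataset.join(pd.get_dummies(origin).astype(float))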

Splitting the Dataset

train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
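
sample(frac=0.8, random_state=0) draws a reproducible 80% sample, and dropping those indices leaves the remaining 20% as the test set. A quick shape check (a sketch, not in the original):

print(train_dataset.shape)  # expected: (318, 10) -- 80% of the 398 rows
print(test_dataset.shape)   # expected: (80, 10)  -- the remaining 20%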

Visualization

sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], diag_kind="kde")
<seaborn.axisgrid.PairGrid at 0x18a81e1a490>

[Figure: pairplot of MPG, Cylinders, Displacement, and Weight from the training set]

train_stats = train_dataset.describe()
train_stats
              MPG   Cylinders  Displacement  Horsepower       Weight  Acceleration  Model Year         USA      Europe       Japan
count  318.000000  318.000000    318.000000  318.000000   318.000000    318.000000  318.000000  318.000000  318.000000  318.000000
mean    23.590566    5.427673    193.061321  104.073418  2963.823899     15.595912   75.946541    0.641509    0.163522    0.194969
std      7.913617    1.682941    103.812742   38.368477   844.749805      2.796282    3.705266    0.480313    0.370424    0.396801
min     10.000000    3.000000     70.000000   46.000000  1613.000000      8.000000   70.000000    0.000000    0.000000    0.000000
25%     17.125000    4.000000    100.250000   75.250000  2219.250000     13.900000   73.000000    0.000000    0.000000    0.000000
50%     22.750000    4.000000    151.000000   92.000000  2792.500000     15.500000   76.000000    1.000000    0.000000    0.000000
75%     29.000000    6.000000    259.500000  120.000000  3571.250000     17.300000   79.000000    1.000000    0.000000    0.000000
max     46.600000    8.000000    455.000000  230.000000  5140.000000     24.800000   82.000000    1.000000    1.000000    1.000000
train_stats.pop("MPG")  # MPG (fuel efficiency) is the target y, so drop its stats
train_stats = train_stats.transpose()
train_stats  # the feature ranges vary widely, so normalization is needed
              count         mean         std     min      25%     50%      75%     max
Cylinders     318.0     5.427673    1.682941     3.0     4.00     4.0     6.00     8.0
Displacement  318.0   193.061321  103.812742    70.0   100.25   151.0   259.50   455.0
Horsepower    318.0   104.073418   38.368477    46.0    75.25    92.0   120.00   230.0
Weight        318.0  2963.823899  844.749805  1613.0  2219.25  2792.5  3571.25  5140.0
Acceleration  318.0    15.595912    2.796282     8.0    13.90    15.5    17.30    24.8
Model Year    318.0    75.946541    3.705266    70.0    73.00    76.0    79.00    82.0
USA           318.0     0.641509    0.480313     0.0     0.00     1.0     1.00     1.0
Europe        318.0     0.163522    0.370424     0.0     0.00     0.0     0.00     1.0
Japan         318.0     0.194969    0.396801     0.0     0.00     0.0     0.00     1.0

Separating Features and Labels

train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

Normalization

def norm(x):
  return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)
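
Both sets are normalized with statistics computed on the training set only, which avoids leaking test-set information into the model. As a consequence, the normalized training features should have mean ≈ 0 and std ≈ 1; a quick sanity check (a sketch, not in the original):

print(normed_train_data.mean().round(6))  # every feature ≈ 0
print(normed_train_data.std().round(6))   # every feature ≈ 1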

Building the Model

  • Two densely connected hidden layers, plus a linear output layer that returns a single value
def build_model():
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam(0.001)

    model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])  # for classification, use metrics=['accuracy']

    return model
model = build_model()
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_6 (Dense)              (None, 64)                640       
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
=================================================================
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________
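The parameter counts follow from (inputs + 1 bias) × units for each Dense layer; with the 9 input features left after one-hot encoding, the numbers in the summary check out. A quick verification (a sketch, not in the original):

n_features = len(train_dataset.keys())   # 9 features after popping MPG
print((n_features + 1) * 64)             # hidden layer 1: 640 parameters
print((64 + 1) * 64)                     # hidden layer 2: 4,160 parameters
print((64 + 1) * 1)                      # output layer:   65 parameters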
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result
array([[ 0.0416676 ],
       [-0.03854847],
       [-0.11580162],
       [-0.17297746],
       [ 0.00868107],
       [-0.3099752 ],
       [-0.3315743 ],
       [ 0.02203141],
       [ 0.15218496],
       [ 0.0900453 ]], dtype=float32)

Training the Model

  • Train for 1,000 epochs
# print a dot (.) at the end of each epoch to show training progress
class PrintDot(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    if epoch % 100 == 0: print('')
    print('.', end='')

EPOCHS = 1000

history = model.fit(
  normed_train_data, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[PrintDot()])
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()
         loss       mae       mse  val_loss   val_mae   val_mse  epoch
995  2.358095  1.010328  2.358095  9.766920  2.035713  9.766920    995
996  2.424612  1.034429  2.424612  9.920653  2.076196  9.920653    996
997  2.260907  1.004899  2.260907  9.677170  2.016382  9.677170    997
998  2.379298  1.019345  2.379298  9.484661  2.004562  9.484661    998
999  2.322387  0.990800  2.322387  9.923290  2.072285  9.923290    999
import matplotlib.pyplot as plt

def plot_history(history):
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  plt.figure(figsize=(8,12))

  plt.subplot(2,1,1)
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [MPG]')
  plt.plot(hist['epoch'], hist['mae'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mae'],
           label = 'Val Error')
  plt.ylim([0,5])
  plt.legend()

  plt.subplot(2,1,2)
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$MPG^2$]')
  plt.plot(hist['epoch'], hist['mse'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mse'],
           label = 'Val Error')
  plt.ylim([0,20])
  plt.legend()
  plt.show()

plot_history(history)

[Figure: training vs. validation MAE and MSE curves over 1,000 epochs]

The training and validation errors diverge considerably, so instead we stop training automatically once the model shows no improvement for a set number of epochs (early stopping).

model = build_model()

# the patience parameter is the number of epochs to wait for an improvement before stopping
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(normed_train_data, train_labels, epochs=EPOCHS,
                    validation_split = 0.2, verbose=0, callbacks=[early_stop, PrintDot()])

plot_history(history)
....................................................................................................
...........................................................

[Figure: training vs. validation error curves with early stopping]
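
Training now stops after roughly 160 epochs, once val_loss has failed to improve for 10 consecutive epochs. By default EarlyStopping keeps the weights from the final epoch rather than the best one; if you want the best weights restored, the flag below does that (a hedged variant, not what the code above uses):

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                           restore_best_weights=True)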

Evaluating on the Test Set

loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)

print("테스트 세트의 평균 절대 오차: {:5.2f} MPG".format(mae))
3/3 - 0s - loss: 6.5907 - mae: 1.9264 - mse: 6.5907
테스트 세트의 평균 절대 오차:  1.93 MPG

Prediction

  • Compare predicted MPG values against the actual test-set labels (y)
test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])

[Figure: scatter plot of true vs. predicted MPG with a diagonal reference line]

error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [MPG]")
_ = plt.ylabel("Count")

[Figure: histogram of prediction errors in MPG]
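
The per-sample errors plotted above can also be summarized as a single root-mean-squared error, which is just the square root of the test MSE reported earlier (a sketch, not in the original):

import numpy as np
rmse = np.sqrt(np.mean(np.square(error)))
print("RMSE: {:.2f} MPG".format(rmse))  # ≈ 2.57, the square root of 6.5907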

Reference: TensorFlow tutorial "Predict fuel efficiency: regression"
