Code: Predicting Car Fuel Efficiency, Keras Edition
Keras Code
This is the code for building and running predictions with the DNN summarized earlier.
The data describes automobile fuel efficiency (MPG); we build a model that predicts it and compare the predictions against the actual values.
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print(tf.__version__)
2.3.0
Dataset
- A model that predicts the fuel efficiency of cars from the late 1970s and early 1980s
- Download the data and assign column names
dataset_path = keras.utils.get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path
'C:\\Users\\uos\\.keras\\datasets\\auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
raw_dataset = pd.read_csv(dataset_path, names=column_names,
                          na_values="?",  # "?" marks missing values in the raw file
                          comment='\t', sep=" ", skipinitialspace=True)
dataset = raw_dataset.copy()
dataset.tail()
|     | MPG  | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year | Origin |
|-----|------|-----------|--------------|------------|--------|--------------|------------|--------|
| 393 | 27.0 | 4 | 140.0 | 86.0 | 2790.0 | 15.6 | 82 | 1 |
| 394 | 44.0 | 4 | 97.0 | 52.0 | 2130.0 | 24.6 | 82 | 2 |
| 395 | 32.0 | 4 | 135.0 | 84.0 | 2295.0 | 11.6 | 82 | 1 |
| 396 | 28.0 | 4 | 120.0 | 79.0 | 2625.0 | 18.6 | 82 | 1 |
| 397 | 31.0 | 4 | 119.0 | 82.0 | 2720.0 | 19.4 | 82 | 1 |
Data Cleaning
dataset.isna().sum()
MPG 0
Cylinders 0
Displacement 0
Horsepower 6
Weight 0
Acceleration 0
Model Year 0
Origin 0
dtype: int64
# rows with missing values
dataset[dataset['Horsepower'].isnull()]
|     | MPG  | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year | Origin |
|-----|------|-----------|--------------|------------|--------|--------------|------------|--------|
| 32  | 25.0 | 4 | 98.0 | NaN | 2046.0 | 19.0 | 71 | 1 |
| 126 | 21.0 | 6 | 200.0 | NaN | 2875.0 | 17.0 | 74 | 1 |
| 330 | 40.9 | 4 | 85.0 | NaN | 1835.0 | 17.3 | 80 | 2 |
| 336 | 23.6 | 4 | 140.0 | NaN | 2905.0 | 14.3 | 80 | 1 |
| 354 | 34.5 | 4 | 100.0 | NaN | 2320.0 | 15.8 | 81 | 2 |
| 374 | 23.0 | 4 | 151.0 | NaN | 3035.0 | 20.5 | 82 | 1 |
# Horsepower is the only column with missing values; replace them with the column mean
dataset = dataset.fillna(dataset['Horsepower'].mean())
dataset[330:337]
|     | MPG  | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year | Origin |
|-----|------|-----------|--------------|------------|--------|--------------|------------|--------|
| 330 | 40.9 | 4 | 85.0 | 104.469388 | 1835.0 | 17.3 | 80 | 2 |
| 331 | 33.8 | 4 | 97.0 | 67.000000 | 2145.0 | 18.0 | 80 | 3 |
| 332 | 29.8 | 4 | 89.0 | 62.000000 | 1845.0 | 15.3 | 80 | 2 |
| 333 | 32.7 | 6 | 168.0 | 132.000000 | 2910.0 | 11.4 | 80 | 3 |
| 334 | 23.7 | 3 | 70.0 | 100.000000 | 2420.0 | 12.5 | 80 | 3 |
| 335 | 35.0 | 4 | 122.0 | 88.000000 | 2500.0 | 15.1 | 80 | 2 |
| 336 | 23.6 | 4 | 140.0 | 104.469388 | 2905.0 | 14.3 | 80 | 1 |
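Note that `fillna(dataset['Horsepower'].mean())` fills NaNs in every column with the Horsepower mean; that is harmless here only because Horsepower is the sole column with missing values. A minimal sketch of two more explicit alternatives, assuming the same `dataset` DataFrame:

# Option 1: impute only the Horsepower column
dataset['Horsepower'] = dataset['Horsepower'].fillna(dataset['Horsepower'].mean())
# Option 2: simply drop the six incomplete rows, as the TensorFlow tutorial does
# dataset = dataset.dropna()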
One-Hot Encoding
- The Origin column (1, 2, 3) is categorical, so it is converted to one-hot columns below
dataset.groupby('Origin').count()
| Origin | MPG | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year |
|--------|-----|-----------|--------------|------------|--------|--------------|------------|
| 1 | 249 | 249 | 249 | 249 | 249 | 249 | 249 |
| 2 | 70 | 70 | 70 | 70 | 70 | 70 | 70 |
| 3 | 79 | 79 | 79 | 79 | 79 | 79 | 79 |
origin = dataset.pop('Origin')  # remove the Origin column, keeping it as a Series
dataset['USA'] = (origin == 1)*1.0
dataset['Europe'] = (origin == 2)*1.0
dataset['Japan'] = (origin == 3)*1.0
dataset.tail()
|     | MPG  | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year | USA | Europe | Japan |
|-----|------|-----------|--------------|------------|--------|--------------|------------|-----|--------|-------|
| 393 | 27.0 | 4 | 140.0 | 86.0 | 2790.0 | 15.6 | 82 | 1.0 | 0.0 | 0.0 |
| 394 | 44.0 | 4 | 97.0 | 52.0 | 2130.0 | 24.6 | 82 | 0.0 | 1.0 | 0.0 |
| 395 | 32.0 | 4 | 135.0 | 84.0 | 2295.0 | 11.6 | 82 | 1.0 | 0.0 | 0.0 |
| 396 | 28.0 | 4 | 120.0 | 79.0 | 2625.0 | 18.6 | 82 | 1.0 | 0.0 | 0.0 |
| 397 | 31.0 | 4 | 119.0 | 82.0 | 2720.0 | 19.4 | 82 | 1.0 | 0.0 | 0.0 |
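The same encoding can be produced in a single call with pandas; a minimal sketch, assuming Origin has not yet been popped from dataset:

# map the numeric codes to region names, then expand into 0/1 indicator columns
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')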
Splitting the Dataset
train_dataset = dataset.sample(frac=0.8,random_state=0)
test_dataset = dataset.drop(train_dataset.index)
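sample(frac=0.8, random_state=0) draws a reproducible 80% of the rows, and dropping those indices leaves the remaining 20% as the test set. A quick size check:

print(len(train_dataset), len(test_dataset))  # 318 80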
Visualization
sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], diag_kind="kde")
(pairplot of MPG, Cylinders, Displacement, and Weight, with kernel density estimates on the diagonal)
train_stats = train_dataset.describe()
train_stats
|       | MPG | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year | USA | Europe | Japan |
|-------|-----|-----------|--------------|------------|--------|--------------|------------|-----|--------|-------|
| count | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 |
| mean  | 23.590566 | 5.427673 | 193.061321 | 104.073418 | 2963.823899 | 15.595912 | 75.946541 | 0.641509 | 0.163522 | 0.194969 |
| std   | 7.913617 | 1.682941 | 103.812742 | 38.368477 | 844.749805 | 2.796282 | 3.705266 | 0.480313 | 0.370424 | 0.396801 |
| min   | 10.000000 | 3.000000 | 70.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25%   | 17.125000 | 4.000000 | 100.250000 | 75.250000 | 2219.250000 | 13.900000 | 73.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50%   | 22.750000 | 4.000000 | 151.000000 | 92.000000 | 2792.500000 | 15.500000 | 76.000000 | 1.000000 | 0.000000 | 0.000000 |
| 75%   | 29.000000 | 6.000000 | 259.500000 | 120.000000 | 3571.250000 | 17.300000 | 79.000000 | 1.000000 | 0.000000 | 0.000000 |
| max   | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 1.000000 | 1.000000 | 1.000000 |
train_stats.pop("MPG")  # MPG (fuel efficiency) is the target y, so exclude it from the feature statistics
train_stats = train_stats.transpose()
train_stats  # the feature ranges vary widely, so normalization is needed
|              | count | mean | std | min | 25% | 50% | 75% | max |
|--------------|-------|------|-----|-----|-----|-----|-----|-----|
| Cylinders    | 318.0 | 5.427673 | 1.682941 | 3.0 | 4.00 | 4.0 | 6.00 | 8.0 |
| Displacement | 318.0 | 193.061321 | 103.812742 | 70.0 | 100.25 | 151.0 | 259.50 | 455.0 |
| Horsepower   | 318.0 | 104.073418 | 38.368477 | 46.0 | 75.25 | 92.0 | 120.00 | 230.0 |
| Weight       | 318.0 | 2963.823899 | 844.749805 | 1613.0 | 2219.25 | 2792.5 | 3571.25 | 5140.0 |
| Acceleration | 318.0 | 15.595912 | 2.796282 | 8.0 | 13.90 | 15.5 | 17.30 | 24.8 |
| Model Year   | 318.0 | 75.946541 | 3.705266 | 70.0 | 73.00 | 76.0 | 79.00 | 82.0 |
| USA          | 318.0 | 0.641509 | 0.480313 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| Europe       | 318.0 | 0.163522 | 0.370424 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Japan        | 318.0 | 0.194969 | 0.396801 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
Separating Features and Labels
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
Normalization
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)
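Both sets are scaled with the training statistics; the test set must never be normalized with statistics computed from the test data itself. A quick sanity check (the training features come out at mean 0 and std 1 up to floating point; the test features only approximately so):

print(normed_train_data.mean().round(2))  # ~0 for every feature
print(normed_train_data.std().round(2))   # ~1 for every feature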
Building the Model
- Two densely connected hidden layers of 64 units each, followed by a single-unit output layer
def build_model():
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    optimizer = tf.keras.optimizers.Adam(0.001)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])  # for classification, use 'accuracy'
    return model

model = build_model()
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_6 (Dense) (None, 64) 640
_________________________________________________________________
dense_7 (Dense) (None, 64) 4160
_________________________________________________________________
dense_8 (Dense) (None, 1) 65
=================================================================
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________
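The parameter counts in the summary can be verified by hand: a Dense layer has (fan_in + 1) × units parameters, the +1 being the bias term. With 9 input features:

print((9 + 1) * 64)   # 640  parameters in dense_6 (9 features -> 64 units)
print((64 + 1) * 64)  # 4160 parameters in dense_7
print((64 + 1) * 1)   # 65   parameters in dense_8 (the output layer)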
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result
array([[ 0.0416676 ],
[-0.03854847],
[-0.11580162],
[-0.17297746],
[ 0.00868107],
[-0.3099752 ],
[-0.3315743 ],
[ 0.02203141],
[ 0.15218496],
[ 0.0900453 ]], dtype=float32)
Training the Model
- Train for 1000 epochs, holding out 20% of the training data for validation
# print a dot (.) at the end of each epoch to show training progress
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        print('.', end='')

EPOCHS = 1000

history = model.fit(
    normed_train_data, train_labels,
    epochs=EPOCHS, validation_split=0.2, verbose=0,
    callbacks=[PrintDot()])
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
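Per the Keras docs, validation_split takes the validation examples from the last portion of the data passed to fit, before any shuffling; so of the 318 training rows roughly 254 are trained on and 64 are held out. A quick check of how Keras computes the split:

split_at = int(len(normed_train_data) * (1 - 0.2))
print(split_at, len(normed_train_data) - split_at)  # 254 64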
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()
|     | loss | mae | mse | val_loss | val_mae | val_mse | epoch |
|-----|------|-----|-----|----------|---------|---------|-------|
| 995 | 2.358095 | 1.010328 | 2.358095 | 9.766920 | 2.035713 | 9.766920 | 995 |
| 996 | 2.424612 | 1.034429 | 2.424612 | 9.920653 | 2.076196 | 9.920653 | 996 |
| 997 | 2.260907 | 1.004899 | 2.260907 | 9.677170 | 2.016382 | 9.677170 | 997 |
| 998 | 2.379298 | 1.019345 | 2.379298 | 9.484661 | 2.004562 | 9.484661 | 998 |
| 999 | 2.322387 | 0.990800 | 2.322387 | 9.923290 | 2.072285 | 9.923290 | 999 |
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure(figsize=(8, 12))

    plt.subplot(2, 1, 1)
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error [MPG]')
    plt.plot(hist['epoch'], hist['mae'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mae'], label='Val Error')
    plt.ylim([0, 5])
    plt.legend()

    plt.subplot(2, 1, 2)
    plt.xlabel('Epoch')
    plt.ylabel('Mean Square Error [$MPG^2$]')
    plt.plot(hist['epoch'], hist['mse'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mse'], label='Val Error')
    plt.ylim([0, 20])
    plt.legend()
    plt.show()
plot_history(history)
The train and validation errors diverge considerably, so instead we stop training automatically once validation performance has not improved for a set number of epochs (early stopping).

model = build_model()

# the patience parameter is the number of epochs to wait for an improvement before stopping
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(normed_train_data, train_labels, epochs=EPOCHS,
                    validation_split=0.2, verbose=0, callbacks=[early_stop, PrintDot()])
plot_history(history)
....................................................................................................
...........................................................
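One optional refinement, not used in the run above: EarlyStopping can also restore the weights from the best validation epoch instead of keeping those from the final, slightly worse one:

# roll the model back to the weights with the lowest val_loss when training stops
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                           restore_best_weights=True)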
Evaluating on the Test Set
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)
print("Mean absolute error on the test set: {:5.2f} MPG".format(mae))
3/3 - 0s - loss: 6.5907 - mae: 1.9264 - mse: 6.5907
Mean absolute error on the test set:  1.93 MPG
In other words, the model's predictions are off by about 1.9 MPG on average.
Prediction
- Compare the MPG values predicted for the test set against the actual values
test_predictions = model.predict(normed_test_data).flatten()
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])
error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [MPG]")
_ = plt.ylabel("Count")
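Beyond the histogram, the residuals can be summarized numerically; a minimal sketch using the error Series from above:

print("mean error: {:6.3f} MPG".format(error.mean()))  # near zero -> little systematic bias
print("std of error: {:6.3f} MPG".format(error.std()))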
Reference: TensorFlow tutorial, "Basic regression: Predict fuel efficiency"