# R_class
### 1. Set up the dataset directories ----
# The dataset root always holds a `train` and a `test` folder, and each of
# those holds one sub-folder per class (NORMAL, PNEUMONIA) with the images.
# Every path is derived from `data_root`, so relocating the dataset means
# editing a single line. Each directory path keeps its trailing "/" because
# file paths in this script are assembled with paste0().
data_root <- "d:/data_set/chest_xray/"
direction_train <- paste0(data_root, "train/")
direction_test <- paste0(data_root, "test/")
direction_train_NORMAL <- paste0(direction_train, "NORMAL/")
direction_train_PNEUMONIA <- paste0(direction_train, "PNEUMONIA/")
direction_test_NORMAL <- paste0(direction_test, "NORMAL/")
direction_test_PNEUMONIA <- paste0(direction_test, "PNEUMONIA/")
# List the image file names found in each class folder.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored at the end of the name: only files ending in ".jpeg" are returned.
jpeg_pattern <- "\\.jpeg$"
train_normal_list <- list.files(
  path = direction_train_NORMAL, pattern = jpeg_pattern
)
train_PNEUMONIA_list <- list.files(
  path = direction_train_PNEUMONIA, pattern = jpeg_pattern
)
test_normal_list <- list.files(
  path = direction_test_NORMAL, pattern = jpeg_pattern
)
test_PNEUMONIA_list <- list.files(
  path = direction_test_PNEUMONIA, pattern = jpeg_pattern
)
## Preview images
# image_load() reads a file at the requested target_size, image_to_array()
# converts it to a numeric array, and as.raster(max = 255) builds the raster
# object that plot() draws.
# Pillow may have to be installed in the Python environment first
# (pip install Pillow, conda install Pillow, or conda install PIL).
# Start by looking at a single image.
preview_path <- paste0(direction_train_PNEUMONIA, "person1467_virus_2544.jpeg")
preview_image <- image_load(preview_path, target_size = c(120, 150))
preview_array <- image_to_array(preview_image)
plot(as.raster(preview_array, max = 255))
# Preview up to 24 NORMAL training images in a 4 x 6 grid.
# par() returns the previous settings; they are restored after the loop so
# that later base-graphics plots are not squeezed into one grid cell with
# near-zero margins.
old_par <- par(mfrow = c(4, 6), pty = "s", mar = c(0.1, 0.1, 0.01, 0.01))
# seq_len(min(...)) avoids building NA file paths when the folder holds
# fewer than 24 images.
for (i in seq_len(min(24, length(train_normal_list)))) {
  paste0(direction_train_NORMAL, train_normal_list[i]) %>%
    image_load(target_size = c(120, 150)) %>%
    image_to_array() %>%
    as.raster(max = 255) %>%
    plot()
}
par(old_par)
####2. 寫函數讀取並壓縮圖片—- generator是一個生成器,主要是訓練自己的資料, 並且資料非常多的時候可以不用把資料全部載入進記憶體,而是用生成器自己一點點讀取。大大提高的執行效率。 image_data_generator 可以放入自己想要的資料前處理, 這邊只做了標準化。 分一個訓練用的, 一個之後test用的, 可跑ROC
# Normalization: 做神經網路的時候, 像素值(0~255)要先縮放到0~1之間, 跑起來才快
#' Build a shuffled batch generator for a directory of training images
#'
#' Thin wrapper around flow_images_from_directory() that rescales pixel
#' values by 1/255 and labels the images in "binary" class mode.
#'
#' @param x Directory to read images from (the training folder, with one
#'   sub-folder per class).
#' @param target_size Length-2 vector giving the size every image is resized
#'   to. The default matches the first two entries of the model's
#'   `input_shape`, c(120, 150, 3).
#' @param batch_size Number of images drawn per batch.
#' @return The generator object returned by flow_images_from_directory().
Generate <- function(x, target_size = c(120, 150), batch_size = 32) {
  flow_images_from_directory(
    x,
    # Scale pixel values from 0-255 into the 0-1 range.
    image_data_generator(rescale = 1 / 255),
    target_size = target_size,
    batch_size = batch_size,
    # "binary" pairs with a single sigmoid output unit; switch to
    # "categorical" when the model ends in a softmax layer.
    class_mode = "binary",
    shuffle = TRUE
  )
}
# test 資料夾也要做一個, 但是 batch_size = 1, 且不能隨機 (shuffle = FALSE), 因為要一個檔案一個檔案依序配對出來。這邊的 x 放 test 的資料夾。
# Generator for the test folder: it yields one image per step in a fixed
# (unshuffled) order so every prediction can be matched to its file.
# `x` is the test directory.
test_Generate <- function(x) {
  # Scale pixel values from 0-255 into the 0-1 range.
  pixel_scaler <- image_data_generator(rescale = 1 / 255)
  flow_images_from_directory(
    x,
    pixel_scaler,
    # "binary" pairs with a sigmoid output; use "categorical" for softmax.
    class_mode = "binary",
    shuffle = FALSE, # keep file order: images are taken one by one
    batch_size = 1, # one image per step
    target_size = c(120, 150) # resize every image to the same size
  )
}
#### 3. Build the CNN with Keras ----
# filters: number of convolution filters; kernel_size: filter height and width.
mymodel.cnn1 <- keras_model_sequential()

# Convolutional stages: four conv + max-pool pairs, with dropout before the
# last pair. input_shape is the image size plus 3 colour channels.
mymodel.cnn1 <- mymodel.cnn1 %>%
  layer_conv_2d(
    filters = 32, kernel_size = c(3, 3), activation = "relu",
    input_shape = c(120, 150, 3)
  ) %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 128, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_conv_2d(filters = 128, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2))

# Classifier head: flatten the feature maps into one vector, then a dense
# layer and a single sigmoid unit.
mymodel.cnn1 <- mymodel.cnn1 %>%
  layer_flatten() %>%
  layer_dense(units = 512, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
# A softmax head works as well:
# layer_dense(units = 2, activation = "softmax")

summary(mymodel.cnn1)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## conv2d (Conv2D) (None, 118, 148, 32) 896
## ________________________________________________________________________________
## max_pooling2d (MaxPooling2D) (None, 59, 74, 32) 0
## ________________________________________________________________________________
## conv2d_1 (Conv2D) (None, 57, 72, 64) 18496
## ________________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D) (None, 28, 36, 64) 0
## ________________________________________________________________________________
## conv2d_2 (Conv2D) (None, 26, 34, 128) 73856
## ________________________________________________________________________________
## max_pooling2d_2 (MaxPooling2D) (None, 13, 17, 128) 0
## ________________________________________________________________________________
## dropout (Dropout) (None, 13, 17, 128) 0
## ________________________________________________________________________________
## conv2d_3 (Conv2D) (None, 11, 15, 128) 147584
## ________________________________________________________________________________
## max_pooling2d_3 (MaxPooling2D) (None, 5, 7, 128) 0
## ________________________________________________________________________________
## flatten (Flatten) (None, 4480) 0
## ________________________________________________________________________________
## dense (Dense) (None, 512) 2294272
## ________________________________________________________________________________
## dense_1 (Dense) (None, 1) 513
## ================================================================================
## Total params: 2,535,617
## Trainable params: 2,535,617
## Non-trainable params: 0
## ________________________________________________________________________________
#### 4. Configure training ----
mymodel.cnn1 %>% compile(
  # Loss function. Binary cross-entropy pairs with the sigmoid output unit;
  # use "categorical_crossentropy" when the model ends in softmax.
  loss = "binary_crossentropy",
  # Optimizer. The learning rate (0.001) is passed positionally because it
  # is the first argument under both its older name (`lr`) and its newer
  # name (`learning_rate`), so the call works across keras releases.
  optimizer = optimizer_rmsprop(0.001),
  # Report accuracy during training.
  metrics = c("acc")
)
### 5. Train the model ----
# fit_generator() trains from a batch generator. fit() and predict() suit
# small data sets that fit in RAM; most real data sets are too large to load
# at once, so fit_generator() / predict_generator() are used together with a
# data generator. See https://kknews.cc/zh-tw/news/pl2nloz.html
train_batches <- Generate(direction_train)
# Shortcut: validation should use its own folder, but here the test set is
# reused as the validation set.
validation_batches <- Generate(direction_test)
cnn.fit1 <- fit_generator(
  mymodel.cnn1,
  train_batches,
  steps_per_epoch = 20, # batches drawn per epoch
  epochs = 20, # number of epochs
  validation_data = validation_batches,
  validation_steps = 30,
  verbose = 1 # show training progress
)
# Plot the training history.
plot(cnn.fit1)
## `geom_smooth()` using formula 'y ~ x'
##6. 預測模型結果 —-
# Make predictions on the test set with the test_Generate() generator.
# It yields one image per step, so `steps` must equal the number of test
# files; the count is taken from the generator's own file list instead of
# being hard-coded. The generator is built once and reused below, so the
# directory is scanned a single time.
test_batches <- test_Generate(direction_test)
n_test_files <- length(test_batches$filenames)
mymodel.cnn_result <- predict_generator(
  mymodel.cnn1,
  test_batches,
  steps = n_test_files,
  verbose = 1
)
# The file names reported by the generator include the class sub-folder.
head(test_batches$filenames)
## [1] "NORMAL\\IM-0001-0001.jpeg" "NORMAL\\IM-0003-0001.jpeg"
## [3] "NORMAL\\IM-0005-0001.jpeg" "NORMAL\\IM-0006-0001.jpeg"
## [5] "NORMAL\\IM-0007-0001.jpeg" "NORMAL\\IM-0009-0001.jpeg"
# Tidy the predictions into one data frame.
# With a softmax output layer, take column 2 of the prediction matrix instead.
mymodel.cnn_value <- data.frame(
  Filename = test_batches$filenames,
  Prob_Pneumonia = mymodel.cnn_result[, 1],
  stringsAsFactors = FALSE
)
# Predicted class: 1 when the predicted probability is at least 0.5, else 0.
mymodel.cnn_value$pred <- as.numeric(mymodel.cnn_value$Prob_Pneumonia >= 0.5)
# True class: mymodel.cnn_value$Filename contains the class folder name, so
# any file whose path mentions PNEUMONIA is labelled 1.
mymodel.cnn_value$real <- as.numeric(
  grepl("PNEUMONIA", mymodel.cnn_value$Filename)
)
# Equivalent alternative:
# mymodel.cnn_value <- cbind(value = mymodel.cnn_result[, 1], filename = c(test_normal_list, test_PNEUMONIA_list), real = c(rep.int(0, times = length(test_normal_list)), rep.int(1, times = length(test_PNEUMONIA_list))))
###7.ROC —-
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# ROC curve of the predicted pneumonia probability against the true class.
# `levels` and `direction` are stated explicitly rather than auto-detected,
# so the curve is always computed as control = 0, case = 1, with controls
# expected to score lower than cases.
cnn.roc <- roc(
  response = mymodel.cnn_value$real,
  predictor = mymodel.cnn_value$Prob_Pneumonia,
  levels = c(0, 1),
  direction = "<",
  plot = TRUE,
  print.auc = TRUE,
  legacy.axes = TRUE
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
### 8. Confusion matrix ----
# Rows are predicted classes, columns are true classes. Both vectors are
# turned into factors with fixed levels so the table is always 2 x 2, even
# when the model predicts a single class; otherwise diag() below would pick
# the wrong cells and misreport the accuracy.
class_levels <- c(0, 1)
confus_matrix_cnn <- table(
  factor(mymodel.cnn_value$pred, levels = class_levels),
  factor(mymodel.cnn_value$real, levels = class_levels)
)
confus_matrix_cnn
##
## 0 1
## 0 115 4
## 1 119 386
### 9. Accuracy ----
# Share of correct predictions: the diagonal of the confusion matrix over
# the total count.
accuracy_cnn <- sum(diag(confus_matrix_cnn)) / sum(confus_matrix_cnn)
accuracy_cnn
## [1] 0.8028846
# 沒有留言:
# 張貼留言