翁世峰: CNN分類PNEUMONIA

R_class

###1. 設定目錄資料夾看照片 —- 設定目錄資料夾固定會分train 跟test 兩個資料夾, 而且每個下面會再依想分類的資料夾分別放置要訓練的資料

# Root folders of the data set: one for training images, one for test images.
direction_train <- "d:/data_set/chest_xray/train/"
direction_test <- "d:/data_set/chest_xray/test/"

# Per-class sub-folders, derived from the roots so the paths cannot drift
# apart. The trailing slash is kept because later code appends file names
# with paste0().
direction_train_NORMAL <- paste0(direction_train, "NORMAL/")
direction_train_PNEUMONIA <- paste0(direction_train, "PNEUMONIA/")

direction_test_NORMAL <- paste0(direction_test, "NORMAL/")
direction_test_PNEUMONIA <- paste0(direction_test, "PNEUMONIA/")

#讀目錄下的所有檔案名稱

# List the image file names found in each class folder.
# `pattern` is a regular expression: the dot is escaped and the match is
# anchored at the end of the name, so only names ending in ".jpeg" are kept
# (an unescaped, unanchored ".jpeg" also matches e.g. "xjpeg.txt").
train_normal_list <- list.files(
  path = direction_train_NORMAL,
  pattern = "\\.jpeg$"
)
train_PNEUMONIA_list <- list.files(
  path = direction_train_PNEUMONIA,
  pattern = "\\.jpeg$"
)

test_normal_list <- list.files(
  path = direction_test_NORMAL,
  pattern = "\\.jpeg$"
)
test_PNEUMONIA_list <- list.files(
  path = direction_test_PNEUMONIA,
  pattern = "\\.jpeg$"
)

##看照片 image_to_array: 把每個影像轉成3維array (高、寬、色彩通道); 這邊target_size設為c(120,150), 所以是120*150*3的array。as.raster 根據array資料將其轉成 raster 物件, 之後才能畫。可能需要先進到python 環境裡面安裝Pillow: pip install Pillow or conda install Pillow or (conda install PIL)

#先抓一張圖來看

# Preview one pneumonia training image: load it resized to 120 x 150,
# convert it to an array, turn the array into a raster (second argument 255)
# and plot the raster.
plot(
  as.raster(
    image_to_array(
      image_load(
        paste0(direction_train_PNEUMONIA, "person1467_virus_2544.jpeg"),
        target_size = c(120, 150)
      )
    ),
    255
  )
)

#抓24張來看

# Preview up to 24 NORMAL training images on a 4 x 6 grid.
# par() returns the previous settings; they are restored after the loop so
# the grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(4, 6), pty = "s", mar = c(0.1, 0.1, 0.01, 0.01))

# Cap the count at the number of available files: indexing past the end of
# the list would yield NA and a non-existent file path.
for (i in seq_len(min(24, length(train_normal_list)))) {
  paste0(direction_train, "NORMAL/", train_normal_list[i]) %>%
    image_load(target_size = c(120, 150)) %>%
    image_to_array() %>%
    as.raster(255) %>%
    plot()
}

par(old_par)

####2. 寫函數讀取並壓縮圖片—- generator是一個生成器，主要是訓練自己的資料，並且資料非常多的時候可以不用把資料全部載入進記憶體，而是用生成器自己一點點讀取。大大提高的執行效率。 image_data_generator 可以放入自己想要的資料前處理, 這邊只做了標準化。分一個訓練用的, 一個之後test用的, 可跑ROC

Normalization #做神經網路的時候，像素值(0~255)要先除以255，轉成介於0~1之間的數字，跑起來才快

#' Build a shuffled batch generator from an image directory
#'
#' Thin wrapper around `flow_images_from_directory()` used for the training
#' (and validation) data.
#'
#' @param x Path of the directory to read images from (the training folder
#'   in this script).
#' @return The generator object returned by `flow_images_from_directory()`.
Generate <- function(x) {
  flow_images_from_directory(
    x,
    # Multiply every pixel value by 1/255.
    generator = image_data_generator(rescale = 1 / 255),
    # Resize all images to one common size.
    target_size = c(120, 150),
    # Number of images drawn per batch.
    batch_size = 32,
    # "binary" goes with the sigmoid output layer; switch to "categorical"
    # when the model ends in a softmax layer.
    class_mode = "binary",
    shuffle = TRUE
  )
}

test 資料夾也要做一個, 但是batch_size=1, 且不能隨機 (shuffle = FALSE), 因為要一個檔案一個檔案地對應出預測結果。這邊的x放test的資料夾。

#' Build an ordered, one-image-per-batch generator from an image directory
#'
#' Same preprocessing as the training generator, but with `batch_size = 1`
#' and `shuffle = FALSE` so images are served one at a time without
#' shuffling.
#'
#' @param x Path of the directory to read images from (the test folder in
#'   this script).
#' @return The generator object returned by `flow_images_from_directory()`.
test_Generate <- function(x) {
  # Multiply every pixel value by 1/255.
  rescaler <- image_data_generator(rescale = 1 / 255)

  flow_images_from_directory(
    x,
    generator = rescaler,
    target_size = c(120, 150),  # resize all images to one common size
    batch_size = 1,             # one image per step
    # "binary" goes with the sigmoid output layer; switch to "categorical"
    # when the model ends in a softmax layer.
    class_mode = "binary",
    shuffle = FALSE             # images must not be shuffled
  )
}

####3. 使用Keras建立CNN—- filters: filter個數# kernel_size: filter的長寬

# Sequential CNN: four convolution + max-pooling stages (with dropout before
# the last convolution), then a dense layer and a single sigmoid output unit.
# The shared kernel and pool sizes are kept inside local() so no extra
# variables are left in the workspace.
mymodel.cnn1 <- local({
  kernel <- c(3, 3)  # height and width of every convolution filter
  pool <- c(2, 2)    # window of every max-pooling layer

  keras_model_sequential() %>%
    layer_conv_2d(
      filters = 32,
      kernel_size = kernel,
      activation = "relu",
      input_shape = c(120, 150, 3)  # image height, width and channels
    ) %>%
    layer_max_pooling_2d(pool_size = pool) %>%
    layer_conv_2d(filters = 64, kernel_size = kernel, activation = "relu") %>%
    layer_max_pooling_2d(pool_size = pool) %>%
    layer_conv_2d(filters = 128, kernel_size = kernel, activation = "relu") %>%
    layer_max_pooling_2d(pool_size = pool) %>%
    layer_dropout(rate = 0.2) %>%
    layer_conv_2d(filters = 128, kernel_size = kernel, activation = "relu") %>%
    layer_max_pooling_2d(pool_size = pool) %>%
    layer_flatten() %>%  # unroll the feature maps into one vector
    layer_dense(units = 512, activation = "relu") %>%
    layer_dense(units = 1, activation = "sigmoid")
})
# A softmax output can be used instead of the sigmoid unit:
# layer_dense(units = 2, activation = "softmax")

summary(mymodel.cnn1)

## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## conv2d (Conv2D)                     (None, 118, 148, 32)            896         
## ________________________________________________________________________________
## max_pooling2d (MaxPooling2D)        (None, 59, 74, 32)              0           
## ________________________________________________________________________________
## conv2d_1 (Conv2D)                   (None, 57, 72, 64)              18496       
## ________________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D)      (None, 28, 36, 64)              0           
## ________________________________________________________________________________
## conv2d_2 (Conv2D)                   (None, 26, 34, 128)             73856       
## ________________________________________________________________________________
## max_pooling2d_2 (MaxPooling2D)      (None, 13, 17, 128)             0           
## ________________________________________________________________________________
## dropout (Dropout)                   (None, 13, 17, 128)             0           
## ________________________________________________________________________________
## conv2d_3 (Conv2D)                   (None, 11, 15, 128)             147584      
## ________________________________________________________________________________
## max_pooling2d_3 (MaxPooling2D)      (None, 5, 7, 128)               0           
## ________________________________________________________________________________
## flatten (Flatten)                   (None, 4480)                    0           
## ________________________________________________________________________________
## dense (Dense)                       (None, 512)                     2294272     
## ________________________________________________________________________________
## dense_1 (Dense)                     (None, 1)                       513         
## ================================================================================
## Total params: 2,535,617
## Trainable params: 2,535,617
## Non-trainable params: 0
## ________________________________________________________________________________

####4. 設定模型參數—-

# Configure training for the sigmoid-output model.
mymodel.cnn1 %>% compile(
  # Binary cross-entropy goes with the single sigmoid unit; use
  # "categorical_crossentropy" when the model ends in a softmax layer.
  loss = "binary_crossentropy",
  # `learning_rate` replaces the deprecated `lr` argument name.
  optimizer = optimizer_rmsprop(learning_rate = 0.001),
  # Report accuracy during training.
  metrics = c("acc")
)

###5. 開始訓練—- fit_generator()是訓練用的函數 (參考: https://kknews.cc/zh-tw/news/pl2nloz.html)。fit()和predict()用於可以整個載到ram中的較小數據集; 但在大多數實際用例中, 幾乎所有數據集都很大, 不能一次加載到ram中。解決方案是將fit_generator()和predict_generator()與數據生成器函數一起使用。

# Train the model from directory generators and keep the training history.
cnn.fit1 <- fit_generator(
  object = mymodel.cnn1,
  # Training batches come from the generator helper defined earlier.
  generator = Generate(direction_train),
  steps_per_epoch = 20,  # batches drawn per epoch
  epochs = 20,           # number of epochs
  # A separate validation folder would be the proper choice; as a shortcut
  # the test set is reused for validation here.
  validation_data = Generate(direction_test),
  validation_steps = 30,
  verbose = 1            # show training progress
)

# Show the training history.

plot(cnn.fit1)

## `geom_smooth()` using formula 'y ~ x'

##6. 預測模型結果 —-

Make predictions on the test set, 所以需要用到test_Generate的函數。steps 就是test資料夾裡的檔案數, 因為需要一張一張抓; steps也可以這樣寫: steps = length(list.files(direction_test, recursive = T))

# Predict on the test set, one image per step.
# The step count comes from the generator's own file list, not a hard-coded
# number, so it stays correct when the contents of the test folder change.
# local() keeps the temporary generator out of the workspace.
mymodel.cnn_result <- local({
  test_gen <- test_Generate(direction_test)
  predict_generator(
    mymodel.cnn1,
    test_gen,
    steps = length(test_gen$filenames),
    verbose = 1
  )
})

檔案的名稱, 他還會抓看是在哪一個資料夾下面的

# Show the first six file names reported by the test generator.
head(test_Generate(direction_test)$filenames, n = 6L)

## [1] "NORMAL\\IM-0001-0001.jpeg" "NORMAL\\IM-0003-0001.jpeg"
## [3] "NORMAL\\IM-0005-0001.jpeg" "NORMAL\\IM-0006-0001.jpeg"
## [5] "NORMAL\\IM-0007-0001.jpeg" "NORMAL\\IM-0009-0001.jpeg"

Do some tidying

# One row per test image: the file name reported by the test generator and
# the value in the first column of the prediction matrix.
# With a softmax output, use mymodel.cnn_result[, 2] instead.
mymodel.cnn_value <- data.frame(
  Filename = test_Generate(direction_test)$filenames,
  Prob_Pneumonia = mymodel.cnn_result[, 1]
)


# Predicted label: 1 when the predicted probability is at least 0.5, else 0.
# The is.na() guard keeps missing probabilities at 0, as a default-then-
# overwrite assignment would.
mymodel.cnn_value$pred <- as.numeric(
  !is.na(mymodel.cnn_value$Prob_Pneumonia) &
    mymodel.cnn_value$Prob_Pneumonia >= 0.5
)

mymodel.cnn_value$Filename 裡面會把資料夾的名字放上去, 所以可以用檔名裡的資料夾名稱來判斷真實類別

# True label: 1 when the file name (which includes its class folder)
# contains "PNEUMONIA", else 0.
mymodel.cnn_value$real <- as.numeric(
  grepl("PNEUMONIA", mymodel.cnn_value$Filename)
)

#這樣寫也可以 #mymodel.cnn_value<-cbind(value=mymodel.cnn_result[,1],filename=c(test_normal_list,test_PNEUMONIA_list), real=c(rep.int(0,times=length(test_normal_list)), rep.int(1,times=length(test_PNEUMONIA_list))))

###7.ROC —-

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# ROC analysis of the predicted probability against the true label; the
# plotting options are passed through to roc() unchanged.
cnn.roc <- roc(
  response = mymodel.cnn_value$real,
  predictor = mymodel.cnn_value$Prob_Pneumonia,
  plot = TRUE,
  print.auc = TRUE,
  legacy.axes = TRUE
)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

###8. confusion matrix—-

# Confusion matrix: rows are predicted labels, columns are true labels.
# Fixing the factor levels to 0/1 keeps the table 2 x 2 even when one class
# is never predicted (or never present), so its diagonal always holds the
# correctly classified counts.
confus_matrix_cnn <- table(
  factor(mymodel.cnn_value$pred, levels = c(0, 1)),
  factor(mymodel.cnn_value$real, levels = c(0, 1))
)

confus_matrix_cnn

##    
##       0   1
##   0 115   4
##   1 119 386

###9.算accuracy —-

# Accuracy: share of test images whose predicted label equals the true label.
# Computed from the labels directly rather than from the diagonal of the
# confusion table, because diag() on a non-square table (a class that is
# never predicted) would pick the wrong cells.
accuracy_cnn <- mean(mymodel.cnn_value$pred == mymodel.cnn_value$real)

accuracy_cnn

## [1] 0.8028846

翁世峰

2020年9月22日星期二

CNN分類PNEUMONIA

R_class

沒有留言:

張貼留言

2020年9月22日 星期二

CNN分類PNEUMONIA

R_class

沒有留言:

張貼留言

2020年9月22日星期二