Text Mining2

Posted by Lisa on December 9, 2015
Text Mining2
library(tidyr)
library(dplyr)
library(tm)
library(stringr)
library(DT)
# Load the radiology results. stringsAsFactors = FALSE keeps the text columns
# as character vectors so they can be searched with regular expressions.
# (The former `rm(list = ls())` call was removed: it deletes every object in
# the caller's workspace and still does not provide a clean session, because
# loaded packages and options are left untouched.)
radiology <- read.csv("radiology_results.csv", header = TRUE, stringsAsFactors = FALSE)
rad <- radiology # work on a copy of the data
# Split TEST_ORDERED on commas into at most three test columns; surplus pieces
# are dropped and rows with fewer pieces are filled on the right.
rad1 <- rad %>%
  separate(TEST_ORDERED, c("T1", "T2", "T3"), sep = ",", extra = "drop", fill = "right")
#datatable(head(rad1[,-7],10))
#rad2=filter(rad1, T1 == "XR Abdomen"|T2 == "XR Abdomen"|T3 == "XR Abdomen")
# Keep only the rows whose first listed test is "XR Abdomen".
rad2 <- filter(rad1, T1 == "XR Abdomen")
# Keep columns 1-3 and 6-7 (selected by position).
rad2 <- rad2[, c(1:3, 6:7)]

Extract keywords from text

Next, let’s try to decide whether the patients with an “XR Abdomen” test have “free air”, “no free air”, or no evidence of either.

For starters, let’s extract all the text related to “free air”.

  • Abdominal result: ABDOMEN
  • Related free air: [A-Za-z]*air|free air|No free air|no free air
  • Negative view: No evidence|no evidence

These are regular expressions. Let’s write a regex for each thing we want to extract and combine them into a single pattern:

# One alternation covering the abdomen heading, any "...air" mention and the
# "no evidence" negation.
pattern <- paste(
  c(
    "ABDOMEN",
    "[A-Za-z]*air|free air|No free air|no free air",
    "No evidence|no evidence"
  ),
  collapse = "|"
)

# Original report text
text <- rad2$RESULT
# Every run of two or more letters in each report, i.e. all of its words
words <- text %>% str_extract_all("[A-Za-z]{2,}")
# Only the fragments matched by the "free air" pattern
freeair <- text %>% str_extract_all(pattern)
freeair %>% head(10)
## [[1]]
## [1] "ABDOMEN"     "no evidence" "freeair"    
## 
## [[2]]
## [1] "no evidence" "air"        
## 
## [[3]]
## [1] "no evidence" "free air"    "no evidence"
## 
## [[4]]
## [1] "intraperitonealair"
## 
## [[5]]
## character(0)
## 
## [[6]]
## character(0)
## 
## [[7]]
## [1] "no evidence"
## 
## [[8]]
## [1] "ABDOMEN"     "No free air" "air"        
## 
## [[9]]
## [1] "no evidence" "air"        
## 
## [[10]]
## [1] "ABDOMEN"     "no evidence" "ABDOMEN"     "No evidence"
# Extract the phrases stating that there is no free air.
# Every alternative starts with `\\b[Nn]o `. The earlier `[Nn]*o` prefix also
# accepted zero N's, so positive findings such as "also free air" or
# "to free air" matched as "o free air" and were counted as negative.
# `[A-Za-z]*air` allows a word fused onto "air" (e.g. "freeair",
# "intraperitonealair"), as seen in the report text.
# NOTE(review): the counts recorded further down were produced with the looser
# pattern; re-run to refresh them.
pattern <- paste0(
  "\\b[Nn]o ",
  c(
    "free [A-Za-z]*air",
    "evidence of free [A-Za-z]*air",
    "evidence of [a-z]* [A-Za-z]*air",
    "evidence of any free air"
  ),
  collapse = "|"
)
nofreeair <- str_extract_all(text, pattern)
head(nofreeair, 15)
## [[1]]
## [1] "no evidence of any freeair"
## 
## [[2]]
## character(0)
## 
## [[3]]
## [1] "no evidence of free air"
## 
## [[4]]
## [1] "no free intraperitonealair"
## 
## [[5]]
## character(0)
## 
## [[6]]
## character(0)
## 
## [[7]]
## character(0)
## 
## [[8]]
## [1] "No free air"
## 
## [[9]]
## character(0)
## 
## [[10]]
## character(0)
## 
## [[11]]
## character(0)
## 
## [[12]]
## [1] "no free air"
## 
## [[13]]
## character(0)
## 
## [[14]]
## character(0)
## 
## [[15]]
## character(0)
# Number of reports with at least one "no free air" phrase
# (5875 in the recorded run)
sum(lengths(nofreeair) > 0)
## [1] 5875

To be continued…

Extract words with capital letters

I found that each report is divided into paragraphs, each introduced by a topic heading in capital letters, such as “CLINICAL INDICATION”, “ABDOMEN”, “CERVICAL SPINE CLINICAL”, etc. So let’s extract those headings, just for fun.

library("wordcloud")
library("wesanderson")
# Pull out upper-case headings: runs of one to three space-separated words,
# each made of at least two capital letters (longest alternative tried first).
topic <- text %>%
  str_extract_all("[A-Z]{2,} [A-Z]{2,} [A-Z]{2,}|[A-Z]{2,} [A-Z]{2,}|[A-Z]{2,}")
topic %>% head()
## [[1]]
## [1] "ABDOMEN" "CHEST"  
## 
## [[2]]
## [1] "PHS"
## 
## [[3]]
## [1] "INDICATION" "CT"         "FINDINGS"   "IMPRESSION" "CT"        
## [6] "CT"        
## 
## [[4]]
## character(0)
## 
## [[5]]
## [1] "CHEST"            "ABDOMINAL SERIES" "CONCLUSION"      
## 
## [[6]]
## [1] "CHEST AND ABDOMINAL" "RAY SERIES"          "INDICATION"         
## [4] "FINDINGS"
# Tabulate how often each heading occurs across all reports.
word_counts <- setNames(
  data.frame(table(unlist(topic))),
  c("word", "count")
)
# Show the 25 most frequent headings, least frequent first.
arrange(top_n(word_counts, 25), count)
##                           word count
## 1                   PA CXR AND   209
## 2         ABDOMINAL SERIES AND   256
## 3          CLINICAL INDICATION   264
## 4                          IUD   268
## 5                   COMPARISON   292
## 6                         COPD   297
## 7             COMPARISON STUDY   323
## 8  ABDOMINAL SERIES INDICATION   402
## 9                           NG   404
## 10                  CONCLUSION   449
## 11           CHEST SINGLE VIEW   487
## 12                         RAY   539
## 13            CLINICAL HISTORY   574
## 14           CHEST AND ABDOMEN   639
## 15                          AP   708
## 16                         CXR   761
## 17                     HISTORY  1374
## 18                          PA  1439
## 19            ABDOMINAL SERIES  1528
## 20                  INDICATION  1838
## 21                       CHEST  2172
## 22                          CT  2686
## 23                     ABDOMEN  4153
## 24                    FINDINGS  6076
## 25                  IMPRESSION  6251
# Select the most frequent headings once so the palette can be sized to match.
# top_n(25) keeps ties, and returns fewer rows when there are under 25 distinct
# headings, so the result is not guaranteed to have exactly 25 rows.
top_words <- word_counts %>%
  top_n(25)

# One colour per plotted word. With ordered.colors = TRUE wordcloud() assigns
# colours to words positionally and rejects a colour vector whose length
# differs from the number of words, so the palette length must follow
# nrow(top_words) rather than a hard-coded 25.
pal <- wes_palette(name = "Zissou", nrow(top_words), type = "continuous") %>%
  as.character()

top_words %>%
  with(wordcloud(word, count,
    ordered.colors = TRUE, colors = pal,
    use.r.layout = TRUE, scale = c(2.5, 0.5)
  ))