Text Mining2
Lisa
December 9, 2015
library(tidyr)
library(dplyr)
library(tm)
library(stringr)
library(DT)
# NOTE: the former `rm(list = ls())` call was removed. It deletes every object
# in the caller's global environment when this script is sourced, yet it does
# not give a clean session (attached packages and options survive). Restart R
# to get a fresh session instead.

# Load the radiology results, keeping text columns as character vectors
# rather than factors.
radiology <- read.csv(
  "radiology_results.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
rad <- radiology # working copy; `radiology` stays as loaded

# Split the comma-separated TEST_ORDERED field into up to three single tests:
# pieces beyond the third are dropped, missing ones are filled with NA.
rad1 <- rad %>%
  separate(
    TEST_ORDERED,
    c("T1", "T2", "T3"),
    sep = ",",
    extra = "drop",
    fill = "right"
  )
#datatable(head(rad1[,-7],10))
#rad2=filter(rad1, T1 == "XR Abdomen"|T2 == "XR Abdomen"|T3 == "XR Abdomen")

# Keep the rows whose first listed test is exactly "XR Abdomen", then keep
# columns 1-3 and 6-7 by position. Later code reads `rad2$RESULT`, so RESULT
# is presumably among the retained columns -- TODO confirm against the CSV.
rad2 <- filter(rad1, T1 == "XR Abdomen")
rad2 <- rad2[, c(1:3, 6:7)]
Extract keywords from the text
Next, let’s try to decide whether the reports for patients with an “XR Abdomen” test mention “free air”, “no free air”, or neither.
For starters, let’s extract all the text related to “free air”.
- Abdominal result:
ABDOMEN
- Related free air:
[A-Za-z]*air|free air|No free air|no free air
- Negative view:
No evidence|no evidence
These are regular expressions. Let’s combine them into a single regex that covers everything we want to extract:
# Three groups of alternatives: the section heading, mentions of (free) air,
# and the negating phrase.
pattern <- c(
  "ABDOMEN",
  "[A-Za-z]*air|free air|No free air|no free air",
  "No evidence|no evidence"
)
# Join the groups into one alternation.
pattern <- paste(pattern, collapse = "|")

# Original report text.
text <- rad2$RESULT

# Every run of two or more letters in each report, one list element per report.
words <- text %>% str_extract_all("[A-Za-z]{2,}")

# Only the fragments that match the "free air" pattern built above.
freeair <- text %>% str_extract_all(pattern)
head(freeair, 10)
## [[1]]
## [1] "ABDOMEN" "no evidence" "freeair"
##
## [[2]]
## [1] "no evidence" "air"
##
## [[3]]
## [1] "no evidence" "free air" "no evidence"
##
## [[4]]
## [1] "intraperitonealair"
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
##
## [[7]]
## [1] "no evidence"
##
## [[8]]
## [1] "ABDOMEN" "No free air" "air"
##
## [[9]]
## [1] "no evidence" "air"
##
## [[10]]
## [1] "ABDOMEN" "no evidence" "ABDOMEN" "No evidence"
# Extract the phrases that state the absence of free air.
#
# Every alternative starts with "[Nn]o". The earlier "[Nn]*o" allowed zero
# N's, so a bare "o" was enough: text such as "also free air" produced the
# match "o free air" and the report was counted as a negative finding.
#
# "[A-Za-z]*air" tolerates words fused onto "air" (the extracted text contains
# forms such as "freeair" and "intraperitonealair"). The former
# "[Nn]*o free air" alternative is omitted because "[Nn]o free [A-Za-z]*air"
# already matches it with an empty prefix.
pattern <- c(
  "[Nn]o free [A-Za-z]*air",
  "[Nn]o evidence of free [A-Za-z]*air",
  "[Nn]o evidence of [a-z]* [A-Za-z]*air|[Nn]o evidence of any free air"
)
pattern <- paste(pattern, collapse = "|")

# One list element per report; character(0) when no phrase is present.
nofreeair <- str_extract_all(text, pattern)
head(nofreeair, 15)
## [[1]]
## [1] "no evidence of any freeair"
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "no evidence of free air"
##
## [[4]]
## [1] "no free intraperitonealair"
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
##
## [[7]]
## character(0)
##
## [[8]]
## [1] "No free air"
##
## [[9]]
## character(0)
##
## [[10]]
## character(0)
##
## [[11]]
## character(0)
##
## [[12]]
## [1] "no free air"
##
## [[13]]
## character(0)
##
## [[14]]
## character(0)
##
## [[15]]
## character(0)
# Number of reports with at least one "no free air" phrase
# (5875 in the recorded run).
sum(lengths(nofreeair) > 0)
## [1] 5875
To be continued…
Extract words with capital letters
I found that each report is divided into paragraphs, each introduced by a topic heading in capital letters, such as “CLINICAL INDICATION”, “ABDOMEN”, “CERVICAL SPINE CLINICAL”, etc. So let’s extract those headings, just for fun.
library("wordcloud")
library("wesanderson")
# Runs of one to three consecutive all-capital words (two or more letters
# each, separated by single spaces). The longest form is listed first so a
# three-word heading is preferred over its shorter prefixes.
topic <- text %>%
  str_extract_all("[A-Z]{2,} [A-Z]{2,} [A-Z]{2,}|[A-Z]{2,} [A-Z]{2,}|[A-Z]{2,}")
topic %>% head()
## [[1]]
## [1] "ABDOMEN" "CHEST"
##
## [[2]]
## [1] "PHS"
##
## [[3]]
## [1] "INDICATION" "CT" "FINDINGS" "IMPRESSION" "CT"
## [6] "CT"
##
## [[4]]
## character(0)
##
## [[5]]
## [1] "CHEST" "ABDOMINAL SERIES" "CONCLUSION"
##
## [[6]]
## [1] "CHEST AND ABDOMINAL" "RAY SERIES" "INDICATION"
## [4] "FINDINGS"
# Frequency table of every extracted heading across all reports, as a
# two-column data frame.
word_counts <- data.frame(table(unlist(topic)))
names(word_counts) <- c("word", "count")

# Show the 25 most frequent headings in ascending order of frequency.
# `top_n()` is given no weighting column, so it ranks by the last column,
# which is `count`.
arrange(top_n(word_counts, 25), count)
## word count
## 1 PA CXR AND 209
## 2 ABDOMINAL SERIES AND 256
## 3 CLINICAL INDICATION 264
## 4 IUD 268
## 5 COMPARISON 292
## 6 COPD 297
## 7 COMPARISON STUDY 323
## 8 ABDOMINAL SERIES INDICATION 402
## 9 NG 404
## 10 CONCLUSION 449
## 11 CHEST SINGLE VIEW 487
## 12 RAY 539
## 13 CLINICAL HISTORY 574
## 14 CHEST AND ABDOMEN 639
## 15 AP 708
## 16 CXR 761
## 17 HISTORY 1374
## 18 PA 1439
## 19 ABDOMINAL SERIES 1528
## 20 INDICATION 1838
## 21 CHEST 2172
## 22 CT 2686
## 23 ABDOMEN 4153
## 24 FINDINGS 6076
## 25 IMPRESSION 6251
# Select the words first so the palette can be sized to match them.
# `top_n()` keeps ties, so it can return more than 25 rows (or fewer when
# there are fewer than 25 distinct words), while `wordcloud()` with
# `ordered.colors = TRUE` needs exactly one colour per word. A palette
# hard-coded to 25 colours fails whenever the two counts differ.
top_words <- top_n(word_counts, 25)
pal <- wes_palette(
  name = "Zissou",
  nrow(top_words),
  type = "continuous"
) %>%
  as.character()

# Draw the cloud; colours are assigned to words in row order.
with(
  top_words,
  wordcloud(
    word,
    count,
    ordered.colors = TRUE,
    colors = pal,
    use.r.layout = TRUE,
    scale = c(2.5, 0.5)
  )
)