# COUNT THE NUMBER OF TIMES WORDS OCCUR IN A FILE.
#
# R. M. Neal, 2004.


# COUNT WORDS IN A FILE.  Takes the name of a file as an argument, defaulting
# to "", which means entering text by typing to the R console.  Returns a
# data frame with variables "word" and "count", where "word" contains the
# distinct words found in the file, in alphabetical order, and "count" contains
# the count of how many times that word occurred (which will be at least 1,
# since only words that occur are entered into the data frame).
#
# Words are defined as sequences of non-whitespace characters separated by
# whitespace characters (space, newline, and tab).  This unfortunately means
# that punctuation marks are considered part of a word.  (Actually, the presence
# of quote marks modifies this - it really does whatever the R "scan" procedure
# does as far as separating items goes.)
#
# The literal word "NA" is counted like any other word, rather than being
# treated as a missing value and dropped.
#
# An error is reported if the file is empty (has no words).

word.count <- function(file = "") {

  # Read all the words and sort them alphabetically.  By default "scan" turns
  # the string "NA" into a missing value, which "sort" would then silently
  # discard; an empty "na.strings" keeps it as an ordinary word.

  words.read <- sort(scan(file, what = "", na.strings = character(0)))

  if (length(words.read) < 1) {
    stop("No words in file")
  }

  # Since the words are sorted, identical words are adjacent, so each run of
  # equal values is one distinct word and the run length is its count.  This
  # replaces an element-by-element loop that re-grew the result vectors with
  # c() for every new word (quadratic in the number of distinct words).

  runs <- rle(words.read)

  # Put the words and counts together in a data frame to return.  Counts are
  # converted to double, matching the type the loop-based version produced.

  data.frame(word = runs$values, count = as.numeric(runs$lengths))
}