Chapter 2 R for the impatient

2.1 Useful commands

Command Description
help() Obtain documentation for a given R command
example() View some examples on the use of a command
c(), scan() Enter data manually to a vector in R
seq() Make arithmetic progression vector
rep() Make vector of repeated values
data() Load (often into a data.frame) built-in dataset
View() View dataset in a spreadsheet-type format
str() Display internal structure of an R object
read.csv(), read.table() Load into a data.frame an existing data file
library(), require() Make available an R add-on package
dim() See dimensions (# of rows/cols) of data.frame
length() Give length of a vector
ls() Lists memory contents
rm() Removes an item from memory
names() Lists names of variables in a data.frame
hist() Command for producing a histogram
histogram() Lattice command for producing a histogram
stem() Make a stem plot
table() List all values of a variable with frequencies
xtabs() Cross-tabulation tables using formulas
mosaicplot() Make a mosaic plot
cut() Groups values of a variable into larger bins
mean(), median() Identify “center” of distribution
by() apply function to a column split by factors
summary() Display 5-number summary and mean
var(), sd() Find variance, sd of values in vector
sum() Add up all values in a vector
quantile() Find the value of a given quantile (percentile) in a dataset
barplot() Produces a bar graph
barchart() Lattice command for producing bar graphs
boxplot() Produces a boxplot
bwplot() Lattice command for producing boxplots
plot() Produces a scatterplot
xyplot() Lattice command for producing a scatterplot
lm() Determine the least-squares regression line
anova() Analysis of variance (can use on results of lm() )
predict() Obtain predicted values from linear model
nls() estimate parameters of a nonlinear model
residuals() gives (observed - predicted) for a model fit to data
sample() take a sample from a vector of data
replicate() repeat some process a set number of times
cumsum() produce running total of values for input vector
ecdf() builds empirical cumulative distribution function
dbinom(), etc. tools for binomial distributions
dpois(), etc. tools for Poisson distributions
pnorm(), etc. tools for normal distribution
qt(), etc. tools for Student t distributions
pchisq(), etc. tools for chi-square distributions
binom.test() hypothesis test and confidence interval for 1 proportion
prop.test() inference for 1 proportion using normal approx.
chisq.test() carries out a chi-square test
fisher.test() Fisher test for contingency table
t.test() student t test for inference on population mean
qqnorm(), qqline() tools for checking normality
addmargins() adds marginal sums to an existing table
prop.table() compute proportions from a contingency table
par() query and edit graphical settings
power.t.test() power calculations for 1- and 2-sample t tests
anova() compute analysis of variance table for fitted model

2.2 Examples

# Load the teen dataset from the web into a data.frame.
mydata <- read.csv("https://tinyurl.com/teendata1")

# Inspect the data: spreadsheet-style view, dimensions, and column names.
View(mydata)
dim(mydata)
names(mydata)

# Basic summaries of the high-school graduation rate column.
mean(mydata$hsgradrate)
median(mydata$hsgradrate)
min(mydata$hsgradrate)
max(mydata$hsgradrate)

# Density-scaled histogram of child poverty with roughly 15 bins.
# `breaks` is spelled out instead of relying on `n` partially matching
# `nclass`, and FALSE is used because F is an ordinary, reassignable variable.
hist(mydata$childpov, breaks = 15, freq = FALSE, col = "red")

# library() stops with an error if lattice is missing; require() would only
# return FALSE and let the histogram() calls below fail later.
library(lattice)
data(iris)

# Lattice histograms: explicit breaks, formula interface with titles,
# one panel per species, and panels stacked in a single column.
histogram(iris$Sepal.Length, breaks = seq(4, 8, 0.25))
histogram(~ Sepal.Length, data = iris, main = "Iris Sepals", xlab = "Length")
histogram(~ Sepal.Length | Species, data = iris, col = "red")
# `nint` (number of bins) is written in full rather than abbreviated to `n`.
histogram(~ Sepal.Length | Species, data = iris, nint = 15, layout = c(1, 3))

# Scatterplot of graduation rate against child poverty.
plot(x = mydata$childpov, y = mydata$hsgradrate, col = "red")

2.3 Arithmetic

# addition
8 + 3

# division
27 / 5

# cosine of -pi (pi is R's built-in constant, ~3.141592...)
cos(-pi)

# absolute value of -2 to the power 3
# note: ^ binds tighter than unary minus, so -2^3 is evaluated as -(2^3)
abs(-2^3)

# square root of 4068289
sqrt(4068289)

# assign the result of 8 + 3 to the variable x
x <- 8 + 3

# assign the number 3 to the variable y
y <- 3

# add x and y
x + y

# multiply x and y
x * y

# assign the result of x multiplied by y to the variable z
z <- x * y

# we can see what's inside z by looking at the Values pane in the environment
# or just by calling z
z

# variables are case-sensitive: z is defined, but Z (uppercase) is not.
# Evaluating Z raises an "object not found" error; tryCatch() returns the
# error message so the rest of the script keeps running when sourced.
tryCatch(Z, error = function(e) conditionMessage(e))

# list the objects in the environment e.g. data, variables, functions, etc.
ls()

# we can remove an object from the environment with the rm() function
rm(y)

# now the variable/object y is no longer defined, i.e. deleted
ls()
# Build a numeric vector named "ourdata" from eight hand-entered values.
ourdata <- c(-3, 2, 0, 1.5, 4, 1, 3, 8)

# How many elements does the vector hold?
length(ourdata)

# Pick out the fifth element by position.
ourdata[5]

# Centre of the data: arithmetic mean, then median.
mean(ourdata)
median(ourdata)

# Smallest and largest value, returned together.
range(ourdata)

# Spread of the data: standard deviation, then variance.
sd(ourdata)
var(ourdata)

# summary() reports the basic statistics in a single call.
summary(ourdata)

# load the psych library
# Install it only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)

# describe() comes from psych; here it is called on the ourdata vector
describe(ourdata)
# load the data with the read.csv() command and assign it to mydata
mydata <- read.csv("http://people.fas.harvard.edu/~mparzen/stat104/cars10.csv")

# get the names of the columns
names(mydata)

# get the dimensions of the dataset
dim(mydata)

## Accessing data or a variable

# Use $ to choose a column: dataframename$colname
# this returns the values of the "mpg" column of the "mydata" data frame
# the number in brackets at the start of each printed line is the index of
# the first value on that line, just to make life easier.
mydata$mpg

# in case you wanted to confirm the index numbers :)
View(mydata)

# median of MPGs
median(mydata$mpg)

# Using attach() and detach()
# attach() puts the columns of mydata on the search path so they can be
# referenced by bare name.
# NOTE(review): with(mydata, median(mpg)) gives the same convenience without
# changing the search path; attach() is kept here because it is the topic.
attach(mydata)

# median of MPGs just by calling the "attached" dataframe
median(mpg)

# standard deviation of MPGs
sd(mpg)

# it is very important to detach() especially while working with multiple dataframes.
detach(mydata)

# now R might complain that object mpg is not found; tryCatch() returns the
# error message instead of halting the rest of the script when it is sourced
tryCatch(sd(mpg), error = function(e) conditionMessage(e))

# we can still use $ to access the mpg column
sd(mydata$mpg)

# calling the describe() function on the combination of the mpg, price, and length columns
describe(mydata[, c("mpg", "price", "length")])
# Three x values and their squares as the matching y values.
x <- c(1, 2, 3)
y <- x^2

# Bare-bones scatterplot using all defaults.
plot(x, y)

# Same points with explicit styling. Try different values for pch
# (Appendix 3 lists the available shapes), cex, main, sub, xlab, and ylab:
plot(
  x, y,
  main = "Our First Plot!",
  sub = "STAT 100",
  xlab = "x",
  ylab = "y",
  xlim = c(0, 4),
  ylim = c(0, 10),
  pch = 19,
  cex = 0.8,
  col = "blue"
)

# Columns are referenced as mydata$column below instead of calling
# attach(mydata): an attach() with no matching detach() leaves the data frame
# on the search path, where its columns can silently mask other objects.

# plot price based on weight
plot(mydata$weight, mydata$price,
     main = "Price of a car versus Weight",
     pch = 19, cex = 0.8,
     xlab = "Weight (lbs)", ylab = "Price ($)")


# Making Histograms
hist(mydata$price, col = "lightgreen")

# Transforming the price data
invprice <- 1 / mydata$price
logprice <- log(mydata$price) # log() is the natural log by default
sqrtprice <- sqrt(mydata$price)

# ask for a 2 x 2 matrix pattern for plots; par() returns the previous
# settings, which are kept so they can be restored afterwards
old_par <- par(mfrow = c(2, 2))

# make histograms of price and the 3 transformed versions of price
hist(mydata$price, main = "Price")
hist(invprice, main = "1/Price")
hist(sqrtprice, main = "sqrt(Price)")
hist(logprice, main = "log(Price)")

# escape from par(mfrow=c(2,2)) by restoring the saved settings; unlike
# dev.off(), this does not close the graphics device and discard the plot
par(old_par)

# making dot plots ~ stacked scatter plots similar to Stem and Leaf Plots
# loading the BHH2 package, where the dotPlot() function resides.
# Install it only when it is missing, so re-running does not reinstall it.
if (!requireNamespace("BHH2", quietly = TRUE)) {
  install.packages("BHH2")
}
library(BHH2)

# Ask for a 2 x 2 matrix pattern again, saving the previous settings
old_par <- par(mfrow = c(2, 2))

# making dot plots of price and the 3 transformed versions of price
dotPlot(mydata$price, main = "Price")
dotPlot(invprice, main = "1/Price")
dotPlot(sqrtprice, main = "sqrt(Price)")
dotPlot(logprice, main = "log(Price)")

# escape from par(mfrow=c(2,2)) by restoring the saved settings
par(old_par)

# Making Boxplots
boxplot(mydata$mpg, main = "Boxplot of MPGs")

# I would encourage you to change the horizontal = TRUE to horizontal = FALSE and replot!
boxplot(mydata$mpg[mydata$foreign == "Foreign"],
        mydata$mpg[mydata$foreign == "Domestic"],
        horizontal = TRUE,
        names = c("Foreign", "Domestic"),
        main = "MPG by Origin of Car",
        col = c("blue", "red"))

2.4 R for the very impatient

# These commands use the childpov and hsgradrate columns of the teen dataset.
# Earlier in this file mydata is reassigned to the cars data (presumably
# without those columns), so the teen data is loaded again here under its own
# name to keep this section runnable on its own.
teen_data <- read.csv("https://tinyurl.com/teendata1")

# Two-way frequency table of the two variables.
table(teen_data$childpov, teen_data$hsgradrate)

# Formula-based tabulations: one variable, then both cross-classified.
xtabs(~hsgradrate, data = teen_data)
xtabs(~hsgradrate + childpov, data = teen_data)

# Mosaic plot of the same cross-classification.
mosaicplot(~hsgradrate + childpov, data = teen_data)