Variables & Measurement

Practice sheet for the SARA Statistics Winter School

Author

SARA Institute of Data Science, Sonipat

Published

January 12, 2025

Modified

November 28, 2024

1 Nominal Variables

1.1 Creating Nominal variables

Gender:

# Create a nominal variable
gender <- factor(c("Male", "Female", "Female", "Male", "Male"))

# Check the structure of the variable
str(gender)

 Factor w/ 2 levels "Female","Male": 2 1 1 2 2

# Print the variable
print(gender)

[1] Male   Female Female Male   Male  
Levels: Female Male

Marital Status:

# Create a nominal variable
marital_status <- factor(c("Single", "Married", "Single", "Divorced", "Widowed"))

# Print the variable
print(marital_status)

[1] Single   Married  Single   Divorced Widowed 
Levels: Divorced Married Single Widowed

# Check the levels
levels(marital_status)

[1] "Divorced" "Married"  "Single"   "Widowed"

Eye Color:

# Create a nominal variable
eye_color <- factor(c("Blue", "Brown", "Green", "Brown", "Blue"))

# Print the variable
print(eye_color)

[1] Blue  Brown Green Brown Blue 
Levels: Blue Brown Green

# Tabulate the frequency of each category
table(eye_color)

eye_color
 Blue Brown Green 
    2     2     1

Dataset:

# Combine the three nominal variables into one data frame.
# The text columns are given as plain character vectors and turned into
# factors in one step by stringsAsFactors = TRUE.
data <- data.frame(
  ID = seq_len(5),
  Gender = c("Male", "Female", "Female", "Male", "Male"),
  Marital_Status = c("Single", "Married", "Single", "Divorced", "Widowed"),
  Eye_Color = c("Blue", "Brown", "Green", "Brown", "Blue"),
  stringsAsFactors = TRUE
)

# Print the data
print(data)

  ID Gender Marital_Status Eye_Color
1  1   Male         Single      Blue
2  2 Female        Married     Brown
3  3 Female         Single     Green
4  4   Male       Divorced     Brown
5  5   Male        Widowed      Blue

1.2 Visualize Nominal Variables

# Nominal variable: marital status of five respondents
marital_status <- factor(
  c("Single", "Married", "Single", "Divorced", "Widowed")
)

# Count each category once, then draw the counts as bars
status_counts <- table(marital_status)
barplot(
  status_counts,
  xlab = "Marital Status",
  ylab = "Frequency",
  main = "Marital Status Distribution",
  col = "lightblue"
)

1.3 Analyzing Nominal Variables

Frequency Table:

# Frequency table for gender
table(gender)

gender
Female   Male 
     2      3

# Frequency table for marital status
table(marital_status)

marital_status
Divorced  Married   Single  Widowed 
       1        1        2        1

Proportions:

# Proportion of each marital status
prop.table(table(marital_status))

marital_status
Divorced  Married   Single  Widowed 
     0.2      0.2      0.4      0.2

2 Ordinal Variables

2.1 Creating Ordinal Variables

Education Level

Education levels often follow an order (e.g., “High School” < “Bachelor’s” < “Master’s” < “Ph.D.”).

# Education is ordinal: list the levels from lowest to highest and let
# ordered() build the ordered factor.
education <- ordered(
  c("Bachelor's", "High School", "Master's",
    "Ph.D.", "Bachelor's", "High School"),
  levels = c("High School", "Bachelor's", "Master's", "Ph.D.")
)

# Print the variable
print(education)

[1] Bachelor's  High School Master's    Ph.D.       Bachelor's  High School
Levels: High School < Bachelor's < Master's < Ph.D.

# Check the structure
str(education)

 Ord.factor w/ 4 levels "High School"<..: 2 1 3 4 2 1

Customer Satisfaction Ratings

Satisfaction levels like “Very Dissatisfied” < “Dissatisfied” < “Neutral” < “Satisfied” < “Very Satisfied” have an inherent order.

# Rating scale, listed from worst to best
satisfaction_scale <- c("Very Dissatisfied", "Dissatisfied",
                        "Neutral", "Satisfied", "Very Satisfied")

# Observed ratings, stored as an ordered factor on that scale
satisfaction <- ordered(
  c("Satisfied", "Neutral", "Very Satisfied",
    "Dissatisfied", "Neutral", "Satisfied"),
  levels = satisfaction_scale
)

# Print the variable
print(satisfaction)

[1] Satisfied      Neutral        Very Satisfied Dissatisfied   Neutral       
[6] Satisfied     
5 Levels: Very Dissatisfied < Dissatisfied < Neutral < ... < Very Satisfied

# Check the levels
levels(satisfaction)

[1] "Very Dissatisfied" "Dissatisfied"      "Neutral"          
[4] "Satisfied"         "Very Satisfied"

2.2 Visualize Ordinal Variables

Bar Plot

A bar plot is ideal for visualizing ordinal variables:

# Bar plot for satisfaction
barplot(table(satisfaction), 
        main = "Customer Satisfaction Levels", 
        col = c("red", "orange", "yellow", "green", "blue"), 
        xlab = "Satisfaction Level", 
        ylab = "Frequency")

Box Plot with Ordinal Data

If an ordinal variable is paired with a numeric variable, box plots can show how the numeric values change across the ordered levels.

# Numeric variable: customer spending (six values)
spending <- c(200, 150, 500, 100, 180, 220)

# One box per satisfaction level, drawn in the order of the factor levels
boxplot(
  spending ~ satisfaction,
  col = "lightblue",
  xlab = "Satisfaction Level",
  ylab = "Spending (in $)",
  main = "Spending by Customer Satisfaction"
)

2.3 Analyzing Ordinal Variables

Frequency Table

# Frequency table for satisfaction
table(satisfaction)

satisfaction
Very Dissatisfied      Dissatisfied           Neutral         Satisfied 
                0                 1                 2                 2 
   Very Satisfied 
                1

Summary Statistics for Ordinal Variables

Although ordinal variables are not numeric, you can explore their distribution:

# Summarize ordinal data
summary(satisfaction)

Very Dissatisfied      Dissatisfied           Neutral         Satisfied 
                0                 1                 2                 2 
   Very Satisfied 
                1

2.4 Testing Ordinal Variables

# Survey data set: two ordinal variables plus numeric spending.
# Each level vector runs from lowest to highest and fixes the factor order.
education_levels <- c("High School", "Bachelor's", "Master's", "Ph.D.")
satisfaction_levels <- c("Very Dissatisfied", "Dissatisfied",
                         "Neutral", "Satisfied", "Very Satisfied")

survey_data <- data.frame(
  ID = seq_len(6),
  Education = ordered(
    c("High School", "Bachelor's", "Master's",
      "Ph.D.", "High School", "Bachelor's"),
    levels = education_levels
  ),
  Satisfaction = ordered(
    c("Neutral", "Satisfied", "Very Satisfied",
      "Dissatisfied", "Neutral", "Satisfied"),
    levels = satisfaction_levels
  ),
  Spending = c(150, 200, 500, 120, 180, 250)
)

# Print the dataset
print(survey_data)

  ID   Education   Satisfaction Spending
1  1 High School        Neutral      150
2  2  Bachelor's      Satisfied      200
3  3    Master's Very Satisfied      500
4  4       Ph.D.   Dissatisfied      120
5  5 High School        Neutral      180
6  6  Bachelor's      Satisfied      250

Compare Groups

Ordinal variables can be used to group and compare other variables.

# Mean spending by education level
aggregate(Spending ~ Education, data = survey_data, FUN = mean)

    Education Spending
1 High School      165
2  Bachelor's      225
3    Master's      500
4       Ph.D.      120

Check Correlation

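While ordinal variables are categorical, their level codes can sometimes be treated as numeric for simple correlation checks. Keep in mind that this treats the levels as equally spaced, which ordinal data does not guarantee; rank-based measures such as Spearman's correlation avoid that assumption.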

# Correlate the integer level codes of satisfaction with spending.
# Spearman's rank correlation is used because ordinal levels are ordered
# but not necessarily equally spaced, which Pearson's r would assume.
cor(as.numeric(survey_data$Satisfaction), survey_data$Spending,
    method = "spearman")

[1] 0.9710083

3 Interval Variables

3.1 Creating Interval Variables

Temperature in Celsius

# Create a vector for temperature
temperature <- c(20, 15, 30, 25, 10, 18)

# Print the variable
print(temperature)

[1] 20 15 30 25 10 18

# Check the structure
str(temperature)

 num [1:6] 20 15 30 25 10 18

IQ Scores

# Create a vector for IQ scores
iq_scores <- c(110, 95, 120, 130, 105, 115)

# Print the variable
print(iq_scores)

[1] 110  95 120 130 105 115

Dates

Calendar dates are interval data: the difference between two dates (in days, months, or years) is meaningful, but there is no true zero date.

# Five calendar dates, parsed from "YYYY-MM-DD" strings
dates <- as.Date(c(
  "2024-01-01", "2024-01-10", "2024-01-15",
  "2024-02-01", "2024-02-15"
))

# Gaps between consecutive dates (a difftime measured in days)
date_intervals <- diff(dates)
print(date_intervals)

Time differences in days
[1]  9  5 17 14

3.2 Analyzing Interval Variables

Summary Statistics

# Summary statistics for temperature
summary(temperature)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.00   15.75   19.00   19.67   23.75   30.00

# Summary statistics for IQ scores
summary(iq_scores)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   95.0   106.2   112.5   112.5   118.8   130.0

Calculating Differences

Since interval variables allow meaningful differences, you can calculate and interpret these:

# Difference in temperature
temperature_diff <- diff(temperature)
print(temperature_diff)

[1]  -5  15  -5 -15   8

3.3 Visualizing Interval Variables

Histogram

A histogram shows the distribution of interval data.

# Histogram for temperature
hist(temperature, 
     main = "Temperature Distribution", 
     xlab = "Temperature (°C)", 
     col = "lightblue", 
     breaks = 5)

Line Plot

If the data has a time component, a line plot is useful.

# Line plot for temperature
plot(temperature, 
     type = "o", 
     main = "Temperature Trend", 
     xlab = "Day", 
     ylab = "Temperature (°C)", 
     col = "blue", 
     pch = 16)

3.4 Working with Dates as Interval Data

Calculating Differences Between Dates

# Calculate the interval in days
date_diff <- as.numeric(diff(dates))
print(date_diff)

[1]  9  5 17 14

Plotting Dates

# Create a line plot with dates
plot(dates[-1], cumsum(date_diff), 
     type = "o", 
     main = "Cumulative Days Over Time", 
     xlab = "Dates", 
     ylab = "Cumulative Days", 
     col = "green", 
     pch = 16)

3.5 Testing Interval Variables

# Six observations of two interval-scale measurements plus a calendar date
observation_dates <- c("2024-01-01", "2024-01-10", "2024-01-15",
                       "2024-02-01", "2024-02-15", "2024-03-01")

interval_data <- data.frame(
  ID = seq_len(6),
  Temperature = c(20, 15, 30, 25, 10, 18),    # interval variable
  IQ_Score = c(110, 95, 120, 130, 105, 115),  # interval variable
  Date = as.Date(observation_dates)
)

# Print the dataset
print(interval_data)

  ID Temperature IQ_Score       Date
1  1          20      110 2024-01-01
2  2          15       95 2024-01-10
3  3          30      120 2024-01-15
4  4          25      130 2024-02-01
5  5          10      105 2024-02-15
6  6          18      115 2024-03-01

Calculate Mean and Standard Deviation

# Mean and SD for temperature
mean_temp <- mean(interval_data$Temperature)
sd_temp <- sd(interval_data$Temperature)

print(mean_temp)

[1] 19.66667

print(sd_temp)

[1] 7.118052

Correlation Between Two Interval Variables

# Correlation between Temperature and IQ Score
cor(interval_data$Temperature, interval_data$IQ_Score)

[1] 0.7403257

4 Ratio Variables

Ratio variables are numerical variables where both differences and ratios between values are meaningful, and there is a true zero point.

4.1 Creating Ratio Variables

Example: Weight (in kg)

# Create a vector for weight
weight <- c(60, 70, 55, 80, 90, 65)

# Print the variable
print(weight)

[1] 60 70 55 80 90 65

# Check the structure
str(weight)

 num [1:6] 60 70 55 80 90 65

Example: Distance Traveled (in km)

# Create a vector for distance
distance <- c(10, 20, 5, 15, 25, 30)

# Print the variable
print(distance)

[1] 10 20  5 15 25 30

Example: Income (in INR)

# Create a vector for income
income <- c(25000, 40000, 30000, 45000, 50000, 35000)

# Print the variable
print(income)

[1] 25000 40000 30000 45000 50000 35000

4.2 Analyzing Ratio Variables

Summary Statistics

# Summary statistics for weight
summary(weight)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  55.00   61.25   67.50   70.00   77.50   90.00

# Summary statistics for distance
summary(distance)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   11.25   17.50   17.50   23.75   30.00

Calculating Ratios

Ratios between two values are meaningful for ratio variables.

# Ratio of the highest weight to the lowest weight
max(weight) / min(weight)

[1] 1.636364

# Ratio of the highest income to the lowest income
max(income) / min(income)

[1] 2

Checking Proportional Relationships

# Proportion of distances to the total distance
distance_prop <- distance / sum(distance)
print(distance_prop)

[1] 0.09523810 0.19047619 0.04761905 0.14285714 0.23809524 0.28571429

4.3 Visualizing Ratio Variables

Histogram

A histogram helps visualize the distribution of ratio variables.

# Histogram for weight
hist(weight, 
     main = "Weight Distribution", 
     xlab = "Weight (kg)", 
     col = "lightblue", 
     breaks = 5)

Box Plot

Box plots are useful to show the range and outliers in ratio data.

# Box plot for income
boxplot(income, 
        main = "Income Distribution", 
        ylab = "Income (INR)", 
        col = "pink")

Scatter Plot

A scatter plot can show relationships between two ratio variables.

# Scatter plot of weight vs. distance
plot(weight, distance, 
     main = "Weight vs. Distance Traveled", 
     xlab = "Weight (kg)", 
     ylab = "Distance Traveled (km)", 
     col = "blue", 
     pch = 16)

4.4 Testing Ratio Variables

Dataset

# Create a dataset with ratio variables
ratio_data <- data.frame(
  ID = 1:6,
  Weight = weight,                    # Ratio variable
  Distance = distance,                # Ratio variable
  Income = income,                    # Ratio variable
  Time = c(2, 4, 1, 3, 5, 6)          # Ratio variable (time in hours)
)

# Print the dataset
print(ratio_data)

  ID Weight Distance Income Time
1  1     60       10  25000    2
2  2     70       20  40000    4
3  3     55        5  30000    1
4  4     80       15  45000    3
5  5     90       25  50000    5
6  6     65       30  35000    6

Mean and Standard Deviation

# Mean and SD for weight
mean_weight <- mean(ratio_data$Weight)
sd_weight <- sd(ratio_data$Weight)

print(mean_weight)

[1] 70

print(sd_weight)

[1] 13.0384

Correlation Between Two Ratio Variables

# Correlation between Weight and Distance
cor(ratio_data$Weight, ratio_data$Distance)

[1] 0.532948

# Correlation between Income and Time
cor(ratio_data$Income, ratio_data$Time)

[1] 0.5428571

Proportional Comparisons

# Proportion of each person's income to the total income
income_prop <- ratio_data$Income / sum(ratio_data$Income)
print(income_prop)

[1] 0.1111111 0.1777778 0.1333333 0.2000000 0.2222222 0.1555556

4.5 Visualizing Ratio Relationships

Bar Plot for Proportions

# Bar plot for income proportions
barplot(income_prop, 
        main = "Income Proportions", 
        names.arg = ratio_data$ID, 
        xlab = "ID", 
        ylab = "Proportion of Total Income", 
        col = "cyan")

Line Plot for Trend

# Line plot of distance against time.
# The rows are not stored in time order, so sort by Time first; otherwise
# type = "o" joins the points in row order and the line doubles back.
time_order <- order(ratio_data$Time)
plot(ratio_data$Time[time_order], ratio_data$Distance[time_order],
     type = "o",
     main = "Distance Traveled Over Time",
     xlab = "Time (hours)",
     ylab = "Distance (km)",
     col = "green",
     pch = 16)