Zooniverse Analysis

📦 Data Overview

# Load flattened data by level
PUS_counts_level1 <- read.csv('PUS_counts_level1.csv')
PUS_counts_level2 <- read.csv('PUS_counts_level2.csv')
PUS_counts_level3 <- read.csv('PUS_counts_level3.csv')
PUS_counts_level4 <- read.csv('PUS_counts_level4.csv')
Positives_level1 <- read.csv('Positive_marks_level1.csv')
Positives_level2 <- read.csv('Positive_marks_level2.csv')
Positives_level3 <- read.csv('Positive_marks_level3.csv')
Positives_level4 <- read.csv('Positive_marks_level4.csv')

PUS_counts_out <- rbind(PUS_counts_level2,PUS_counts_level3,PUS_counts_level4,PUS_counts_level1)
Positives_marks <- rbind(Positives_level2,Positives_level3,Positives_level4,Positives_level1)


# Show structure
head(PUS_counts_out)

##   X subject_ids     user_name  class_id             variable value
## 1 1    46488482 Jazmineromo89 571864993 Grouper, All Species    19
## 2 2    47498581 Jazmineromo89 571865022 Grouper, All Species    33
## 3 3    46630512 Jazmineromo89 571865043 Grouper, All Species    35
## 4 4   100577061 Jazmineromo89 571865049 Grouper, All Species     0
## 5 5   100577038 Jazmineromo89 571865062 Grouper, All Species     0
## 6 6    47498604 Jazmineromo89 571865097 Grouper, All Species    53

🧪 Flattened Data Summary and Visulization

# count the number of unique users per photo per species
users_per_photo <- PUS_counts_out %>%
  select(subject_ids, user_name, variable) %>%
  distinct() %>%
  group_by(subject_ids, variable) %>%
  summarise(n_users = n_distinct(user_name), .groups = "drop")

#range(users_per_photo$n_users)

ggplot(users_per_photo, aes(x = factor(n_users))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Number of Users per photo per species",
    x = "Number of Users",
    y = "Number of Photos"
  ) +
  theme_minimal()

# Count unique photos per user
photos_per_user <- PUS_counts_out %>%
  select(user_name, subject_ids) %>%
  distinct() %>%                          # ensure each photo is only counted once per user
  group_by(user_name) %>%
  summarise(n_photos = n(), .groups = "drop")  # count unique photos per user

ggplot(photos_per_user, aes(x = n_photos)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  scale_x_log10() +
  labs(
    title = "Histogram of Photos Reviewed per User (Log Scale)",
    x = "Number of Photos Reviewed (log10 scale)",
    y = "Number of Users"
  ) +
  theme_minimal()

✅ Agreement among Users

# Summarize agreement per species per photo

agreement_stats <- PUS_counts_out %>%
  group_by(subject_ids, variable) %>%
  summarise(
    mean_count = mean(value, na.rm = TRUE),
    median_count = median(value, na.rm = TRUE),
    sd_count = sd(value, na.rm = TRUE),
    iqr_count = IQR(value, na.rm = TRUE),
    min_count = min(value),
    max_count = max(value),
    range_count = max(value) - min(value),
    n_users = n(),
    .groups = "drop"
  )
#### Add mode and agreement count/percentages

# Calculate total users per photo–species
total_users <- PUS_counts_out %>%
  distinct(subject_ids, variable, user_name) %>%
  group_by(subject_ids, variable) %>%
  summarise(total_users = n(), .groups = "drop")

# Calculate mode and how many users selected it
mode_summary <- PUS_counts_out %>%
  group_by(subject_ids, variable, value) %>%
  summarise(n_users = n(), .groups = "drop") %>%
  group_by(subject_ids, variable) %>%
  slice_max(n_users, n = 1, with_ties = FALSE) %>%  # Get most common value
  rename(mode_value = value, mode_count = n_users)

# Merge and compute agreement %
mode_agreement_summary <- left_join(mode_summary, total_users, by = c("subject_ids", "variable")) %>%
  mutate(agreement_percent = (mode_count / total_users) * 100) %>%
  mutate(response_type = ifelse(mode_value == 0, "Zero", "Positive"))

# Visualize — Histogram of Agreement Percent by species

species_stats <- mode_agreement_summary %>%
  group_by(variable) %>%
  summarise(
    mean_agreement = mean(agreement_percent, na.rm = TRUE),
    sd_agreement = sd(agreement_percent, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    label = paste0("Mean ± SD:\n", round(mean_agreement, 1), " ± ", round(sd_agreement, 1), "%")
  )

# Reorder variable so "Fish" and "People" are last
mode_agreement_summary$variable <- forcats::fct_relevel(
  mode_agreement_summary$variable,
  setdiff(unique(mode_agreement_summary$variable), c("Fish", "People")),
  "Fish", "People"
)
# Also reorder species_stats to match (if used in facet-related geoms)
species_stats$variable <- factor(
  species_stats$variable,
  levels = levels(mode_agreement_summary$variable)
)

ggplot(mode_agreement_summary, aes(x = agreement_percent, fill = response_type)) +
  geom_histogram(
    binwidth = 5, 
    color = "black", 
    alpha = 0.4, 
    position = "identity"
  ) +
  geom_vline(
    data = species_stats,
    aes(xintercept = mean_agreement),
    color = "darkblue", 
    linetype = "dashed", 
    size = 0.8
  ) +
  facet_wrap(~ variable, scales = "free_y") +
  geom_text(
    data = species_stats,
    aes(x = 5, y = Inf, label = label),
    vjust = 1.2, hjust = 0,
    size = 3.5,
    inherit.aes = FALSE
  ) +
  labs(
    title = "User Agreement on Species Count per Photo",
    x = "Agreement Percentage (mode count / total users)",
    y = "Number of Photos",
    fill = "Count Type"
  ) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold"),
    legend.position = "bottom"
  )

species_stats2 <- mode_agreement_summary %>%
  group_by(variable, response_type) %>%
  summarise(
    mean_agreement = mean(agreement_percent, na.rm = TRUE),
    sd_agreement = sd(agreement_percent, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    label = paste0(round(mean_agreement, 1), " ± ", round(sd_agreement, 1), "%")
  )
print(species_stats2)

## # A tibble: 24 × 5
##    variable    response_type mean_agreement sd_agreement label       
##    <fct>       <chr>                  <dbl>        <dbl> <chr>       
##  1 Amberjack   Positive                46.5         16.7 46.5 ± 16.7%
##  2 Amberjack   Zero                    64.1         21.9 64.1 ± 21.9%
##  3 Red Snapper Positive                43.8         18.6 43.8 ± 18.6%
##  4 Red Snapper Zero                    57.8         27.1 57.8 ± 27.1%
##  5 Shark       Positive                64.6         16.5 64.6 ± 16.5%
##  6 Shark       Zero                    93.1         12.7 93.1 ± 12.7%
##  7 Cobia       Positive                78.4         22.0 78.4 ± 22%  
##  8 Cobia       Zero                    88.6         13.2 88.6 ± 13.2%
##  9 Dolphin     Positive                59.4         19.9 59.4 ± 19.9%
## 10 Dolphin     Zero                    87.3         16.4 87.3 ± 16.4%
## # ℹ 14 more rows

ggplot(species_stats2, aes(x = response_type, y = mean_agreement, fill = response_type)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_errorbar(
    aes(ymin = mean_agreement - sd_agreement, ymax = mean_agreement + sd_agreement),
    width = 0.2,
    position = position_dodge(width = 0.6)
  ) +
  geom_text(
    aes(label = label),
    position = position_dodge(width = 0.6),
    vjust = -0.5,  # adjust for spacing above the bar
    size = 3
  ) +
  facet_wrap(~ variable) +
  scale_fill_manual(
    values = c("Positive" = "#1b9e77", "Zero" = "#d9d9d9")  # custom colors
  ) +
  labs(
    title = "Mean ± SD of Agreement Percent by Species and Response Type",
    x = "Response Type",
    y = "Agreement Percentage"
  ) +
  theme_minimal() +
  theme(strip.text = element_text(face = "bold"))

# Merge the stats 
agreement_stats_more <- left_join(agreement_stats, mode_agreement_summary,
                                  by = c("subject_ids", "variable"))
# Visualize variability across species
ggplot(agreement_stats_more, aes(x = variable, y = sd_count)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Variation in Count Estimates Across Species",
    x = "Species",
    y = "Standard Deviation of User Counts"
  ) +
  theme_minimal() +
  coord_flip()

####################################################################
photo_level_counts_long <- agreement_stats_more %>%
  select(subject_ids, variable, mean_count, median_count, mode_value) %>%
  pivot_longer(
    cols = c(mean_count, median_count, mode_value),
    names_to = "statistic",
    values_to = "count"
  )
ggplot(photo_level_counts_long, aes(x = statistic, y = count, fill = statistic)) +
  geom_boxplot(alpha = 0.7, outlier.size = 0.5) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(
    title = "Comparison of Mean, Median, and Mode Counts by Species (Photo Level)",
    x = "Statistic",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

📊 DBSCAN Clustering Analysis

# Parameters for DBSCAN
eps <- 10      # Distance threshold in coordinate units
minPts <- 3    # Minimum number of marks to form a cluster

# Apply DBSCAN clustering by subject_ids and tool_label
clustered_data <- Positives_marks %>%
  filter(!is.na(xcoord), !is.na(ycoord), !is.na(tool_label)) %>%
  group_by(subject_ids, tool_label) %>%
  group_split() %>%
  map_dfr(function(df) {
    coords <- df %>% select(xcoord, ycoord)
    
    # Handle cases with too few marks to form a cluster
    if (nrow(coords) < minPts) {
      df$cluster_id <- NA
      return(df)
    }
    # Run DBSCAN
    clustering <- dbscan(coords, eps = eps, minPts = minPts)
    # Add cluster ID (0 = noise)
    df$cluster_id <- clustering$cluster
    df
  })

# Count Number of agreement clusters per photo and species
# how many unique users contributed marks to each cluster
cluster_summary <- clustered_data %>%
  filter(!is.na(cluster_id), cluster_id > 0) %>%  # Keep valid clusters only (exclude noise)
  group_by(subject_ids, tool_label, cluster_id) %>%
  summarise(
    n_users = n_distinct(user_name),              # Number of unique users per cluster
    .groups = "drop"
  )

# Filter clusters with strong agreement (e.g., ≥ 50% agreement)
user_totals <- clustered_data %>%
  distinct(subject_ids, tool_label, user_name) %>%
  count(subject_ids, tool_label, name = "total_users")

high_confidence_clusters <- cluster_summary %>%
  left_join(user_totals, by = c("subject_ids", "tool_label")) %>%
  mutate(user_agreement = n_users / total_users) %>%
  filter(user_agreement >= 0.5)  # keep clusters marked by ≥50% of users

# How many strong clusters per photo/species?
n_cluster_high <- high_confidence_clusters %>%
  group_by(subject_ids, tool_label) %>%     # per photo and species
  summarise(
    n_high_conf_clusters = n_distinct(cluster_id),   # how many clusters met ≥ 50% agreement
    max_agreement_users = max(n_users),              # largest number of users agreeing on any single cluster
    .groups = "drop"
  ) %>%
  arrange(desc(max_agreement_users))        # rank by strength of top cluster

high_confidence_clusters %>%
  group_by(subject_ids, tool_label) %>%
  summarise(n_high_conf_clusters = n_distinct(cluster_id), .groups = "drop") %>%
  ggplot(aes(x = fct_reorder(tool_label, n_high_conf_clusters, .fun = median), 
             y = n_high_conf_clusters, 
             fill = tool_label)) +
  geom_boxplot() +
  scale_fill_viridis_d(guide = "none") +
  labs(
    title = "Distribution of High-Confidence Clusters per Species",
    x = "Species (ordered by median # clusters)",
    y = "# High-Confidence Clusters (≥50% user agreement)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total clusters per photo and species
total_clusters <- clustered_data %>%
  filter(!is.na(cluster_id), cluster_id > 0) %>%
  distinct(subject_ids, tool_label, cluster_id) %>%
  count(subject_ids, tool_label, name = "total_clusters")

cluster_proportions <- left_join(n_cluster_high, total_clusters,
                                 by = c("subject_ids", "tool_label")) %>%
  mutate(prop_high_conf_clusters = n_high_conf_clusters / total_clusters)

ggplot(cluster_proportions, aes(x = tool_label, y = prop_high_conf_clusters)) +
  geom_boxplot(fill = "#4daf4a", alpha = 0.7) +
  labs(
    title = "Proportion of High-Confidence Clusters per Species",
    x = "Species",
    y = "Proportion of Clusters (≥50% user agreement)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##### compare mode estimate with number of high-confidence cluster

mode_positive <- mode_agreement_summary %>%
  filter(mode_value > 0)
names(mode_positive)[2] <- 'tool_label'
  
compare_data <- left_join(n_cluster_high, mode_positive,
                          by = c("subject_ids", "tool_label"))

compare_data_clean <- compare_data %>%
  filter(!is.na(n_high_conf_clusters), !is.na(mode_value))

ggplot(compare_data, aes(x = mode_count, y = n_high_conf_clusters)) +
  geom_point(alpha = 0.7, size = 2.5, color = "#1f78b4") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray30") +
  facet_wrap(~ tool_label) +
  labs(
    title = "High-Confidence Clusters vs. Mode Count",
    x = "Mode Count",
    y = "# High-Confidence Clusters"
  ) +
  theme_minimal() +
  theme(strip.text = element_text(face = "bold"))

Zooniverse Analysis

Jie Cao

2025-07-03

📦 Data Overview

🧪 Flattened Data Summary and Visulization

✅ Agreement among Users

📊 DBSCAN Clustering Analysis