📦 Data Overview
# Load flattened data by level
PUS_counts_level1 <- read.csv('PUS_counts_level1.csv')
PUS_counts_level2 <- read.csv('PUS_counts_level2.csv')
PUS_counts_level3 <- read.csv('PUS_counts_level3.csv')
PUS_counts_level4 <- read.csv('PUS_counts_level4.csv')
Positives_level1 <- read.csv('Positive_marks_level1.csv')
Positives_level2 <- read.csv('Positive_marks_level2.csv')
Positives_level3 <- read.csv('Positive_marks_level3.csv')
Positives_level4 <- read.csv('Positive_marks_level4.csv')
PUS_counts_out <- rbind(PUS_counts_level2,PUS_counts_level3,PUS_counts_level4,PUS_counts_level1)
Positives_marks <- rbind(Positives_level2,Positives_level3,Positives_level4,Positives_level1)
# Show structure
head(PUS_counts_out)
## X subject_ids user_name class_id variable value
## 1 1 46488482 Jazmineromo89 571864993 Grouper, All Species 19
## 2 2 47498581 Jazmineromo89 571865022 Grouper, All Species 33
## 3 3 46630512 Jazmineromo89 571865043 Grouper, All Species 35
## 4 4 100577061 Jazmineromo89 571865049 Grouper, All Species 0
## 5 5 100577038 Jazmineromo89 571865062 Grouper, All Species 0
## 6 6 47498604 Jazmineromo89 571865097 Grouper, All Species 53
🧪 Flattened Data Summary and Visulization
# count the number of unique users per photo per species
users_per_photo <- PUS_counts_out %>%
select(subject_ids, user_name, variable) %>%
distinct() %>%
group_by(subject_ids, variable) %>%
summarise(n_users = n_distinct(user_name), .groups = "drop")
#range(users_per_photo$n_users)
ggplot(users_per_photo, aes(x = factor(n_users))) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Number of Users per photo per species",
x = "Number of Users",
y = "Number of Photos"
) +
theme_minimal()

# Count unique photos per user
photos_per_user <- PUS_counts_out %>%
select(user_name, subject_ids) %>%
distinct() %>% # ensure each photo is only counted once per user
group_by(user_name) %>%
summarise(n_photos = n(), .groups = "drop") # count unique photos per user
ggplot(photos_per_user, aes(x = n_photos)) +
geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
scale_x_log10() +
labs(
title = "Histogram of Photos Reviewed per User (Log Scale)",
x = "Number of Photos Reviewed (log10 scale)",
y = "Number of Users"
) +
theme_minimal()

✅ Agreement among Users
# Summarize agreement per species per photo
agreement_stats <- PUS_counts_out %>%
group_by(subject_ids, variable) %>%
summarise(
mean_count = mean(value, na.rm = TRUE),
median_count = median(value, na.rm = TRUE),
sd_count = sd(value, na.rm = TRUE),
iqr_count = IQR(value, na.rm = TRUE),
min_count = min(value),
max_count = max(value),
range_count = max(value) - min(value),
n_users = n(),
.groups = "drop"
)
#### Add mode and agreement count/percentages
# Calculate total users per photo–species
total_users <- PUS_counts_out %>%
distinct(subject_ids, variable, user_name) %>%
group_by(subject_ids, variable) %>%
summarise(total_users = n(), .groups = "drop")
# Calculate mode and how many users selected it
mode_summary <- PUS_counts_out %>%
group_by(subject_ids, variable, value) %>%
summarise(n_users = n(), .groups = "drop") %>%
group_by(subject_ids, variable) %>%
slice_max(n_users, n = 1, with_ties = FALSE) %>% # Get most common value
rename(mode_value = value, mode_count = n_users)
# Merge and compute agreement %
mode_agreement_summary <- left_join(mode_summary, total_users, by = c("subject_ids", "variable")) %>%
mutate(agreement_percent = (mode_count / total_users) * 100) %>%
mutate(response_type = ifelse(mode_value == 0, "Zero", "Positive"))
# Visualize — Histogram of Agreement Percent by species
species_stats <- mode_agreement_summary %>%
group_by(variable) %>%
summarise(
mean_agreement = mean(agreement_percent, na.rm = TRUE),
sd_agreement = sd(agreement_percent, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
label = paste0("Mean ± SD:\n", round(mean_agreement, 1), " ± ", round(sd_agreement, 1), "%")
)
# Reorder variable so "Fish" and "People" are last
mode_agreement_summary$variable <- forcats::fct_relevel(
mode_agreement_summary$variable,
setdiff(unique(mode_agreement_summary$variable), c("Fish", "People")),
"Fish", "People"
)
# Also reorder species_stats to match (if used in facet-related geoms)
species_stats$variable <- factor(
species_stats$variable,
levels = levels(mode_agreement_summary$variable)
)
ggplot(mode_agreement_summary, aes(x = agreement_percent, fill = response_type)) +
geom_histogram(
binwidth = 5,
color = "black",
alpha = 0.4,
position = "identity"
) +
geom_vline(
data = species_stats,
aes(xintercept = mean_agreement),
color = "darkblue",
linetype = "dashed",
size = 0.8
) +
facet_wrap(~ variable, scales = "free_y") +
geom_text(
data = species_stats,
aes(x = 5, y = Inf, label = label),
vjust = 1.2, hjust = 0,
size = 3.5,
inherit.aes = FALSE
) +
labs(
title = "User Agreement on Species Count per Photo",
x = "Agreement Percentage (mode count / total users)",
y = "Number of Photos",
fill = "Count Type"
) +
scale_x_continuous(breaks = seq(0, 100, 10)) +
theme_minimal() +
theme(
strip.text = element_text(face = "bold"),
legend.position = "bottom"
)

species_stats2 <- mode_agreement_summary %>%
group_by(variable, response_type) %>%
summarise(
mean_agreement = mean(agreement_percent, na.rm = TRUE),
sd_agreement = sd(agreement_percent, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
label = paste0(round(mean_agreement, 1), " ± ", round(sd_agreement, 1), "%")
)
print(species_stats2)
## # A tibble: 24 × 5
## variable response_type mean_agreement sd_agreement label
## <fct> <chr> <dbl> <dbl> <chr>
## 1 Amberjack Positive 46.5 16.7 46.5 ± 16.7%
## 2 Amberjack Zero 64.1 21.9 64.1 ± 21.9%
## 3 Red Snapper Positive 43.8 18.6 43.8 ± 18.6%
## 4 Red Snapper Zero 57.8 27.1 57.8 ± 27.1%
## 5 Shark Positive 64.6 16.5 64.6 ± 16.5%
## 6 Shark Zero 93.1 12.7 93.1 ± 12.7%
## 7 Cobia Positive 78.4 22.0 78.4 ± 22%
## 8 Cobia Zero 88.6 13.2 88.6 ± 13.2%
## 9 Dolphin Positive 59.4 19.9 59.4 ± 19.9%
## 10 Dolphin Zero 87.3 16.4 87.3 ± 16.4%
## # ℹ 14 more rows
ggplot(species_stats2, aes(x = response_type, y = mean_agreement, fill = response_type)) +
geom_col(position = "dodge", width = 0.6) +
geom_errorbar(
aes(ymin = mean_agreement - sd_agreement, ymax = mean_agreement + sd_agreement),
width = 0.2,
position = position_dodge(width = 0.6)
) +
geom_text(
aes(label = label),
position = position_dodge(width = 0.6),
vjust = -0.5, # adjust for spacing above the bar
size = 3
) +
facet_wrap(~ variable) +
scale_fill_manual(
values = c("Positive" = "#1b9e77", "Zero" = "#d9d9d9") # custom colors
) +
labs(
title = "Mean ± SD of Agreement Percent by Species and Response Type",
x = "Response Type",
y = "Agreement Percentage"
) +
theme_minimal() +
theme(strip.text = element_text(face = "bold"))

# Merge the stats
agreement_stats_more <- left_join(agreement_stats, mode_agreement_summary,
by = c("subject_ids", "variable"))
# Visualize variability across species
ggplot(agreement_stats_more, aes(x = variable, y = sd_count)) +
geom_boxplot(fill = "lightblue") +
labs(
title = "Variation in Count Estimates Across Species",
x = "Species",
y = "Standard Deviation of User Counts"
) +
theme_minimal() +
coord_flip()

####################################################################
photo_level_counts_long <- agreement_stats_more %>%
select(subject_ids, variable, mean_count, median_count, mode_value) %>%
pivot_longer(
cols = c(mean_count, median_count, mode_value),
names_to = "statistic",
values_to = "count"
)
ggplot(photo_level_counts_long, aes(x = statistic, y = count, fill = statistic)) +
geom_boxplot(alpha = 0.7, outlier.size = 0.5) +
facet_wrap(~ variable, scales = "free_y") +
labs(
title = "Comparison of Mean, Median, and Mode Counts by Species (Photo Level)",
x = "Statistic",
y = "Count"
) +
theme_minimal() +
theme(legend.position = "none")

📊 DBSCAN Clustering Analysis
# Parameters for DBSCAN
eps <- 10 # Distance threshold in coordinate units
minPts <- 3 # Minimum number of marks to form a cluster
# Apply DBSCAN clustering by subject_ids and tool_label
clustered_data <- Positives_marks %>%
filter(!is.na(xcoord), !is.na(ycoord), !is.na(tool_label)) %>%
group_by(subject_ids, tool_label) %>%
group_split() %>%
map_dfr(function(df) {
coords <- df %>% select(xcoord, ycoord)
# Handle cases with too few marks to form a cluster
if (nrow(coords) < minPts) {
df$cluster_id <- NA
return(df)
}
# Run DBSCAN
clustering <- dbscan(coords, eps = eps, minPts = minPts)
# Add cluster ID (0 = noise)
df$cluster_id <- clustering$cluster
df
})
# Count Number of agreement clusters per photo and species
# how many unique users contributed marks to each cluster
cluster_summary <- clustered_data %>%
filter(!is.na(cluster_id), cluster_id > 0) %>% # Keep valid clusters only (exclude noise)
group_by(subject_ids, tool_label, cluster_id) %>%
summarise(
n_users = n_distinct(user_name), # Number of unique users per cluster
.groups = "drop"
)
# Filter clusters with strong agreement (e.g., ≥ 50% agreement)
user_totals <- clustered_data %>%
distinct(subject_ids, tool_label, user_name) %>%
count(subject_ids, tool_label, name = "total_users")
high_confidence_clusters <- cluster_summary %>%
left_join(user_totals, by = c("subject_ids", "tool_label")) %>%
mutate(user_agreement = n_users / total_users) %>%
filter(user_agreement >= 0.5) # keep clusters marked by ≥50% of users
# How many strong clusters per photo/species?
n_cluster_high <- high_confidence_clusters %>%
group_by(subject_ids, tool_label) %>% # per photo and species
summarise(
n_high_conf_clusters = n_distinct(cluster_id), # how many clusters met ≥ 50% agreement
max_agreement_users = max(n_users), # largest number of users agreeing on any single cluster
.groups = "drop"
) %>%
arrange(desc(max_agreement_users)) # rank by strength of top cluster
high_confidence_clusters %>%
group_by(subject_ids, tool_label) %>%
summarise(n_high_conf_clusters = n_distinct(cluster_id), .groups = "drop") %>%
ggplot(aes(x = fct_reorder(tool_label, n_high_conf_clusters, .fun = median),
y = n_high_conf_clusters,
fill = tool_label)) +
geom_boxplot() +
scale_fill_viridis_d(guide = "none") +
labs(
title = "Distribution of High-Confidence Clusters per Species",
x = "Species (ordered by median # clusters)",
y = "# High-Confidence Clusters (≥50% user agreement)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total clusters per photo and species
total_clusters <- clustered_data %>%
filter(!is.na(cluster_id), cluster_id > 0) %>%
distinct(subject_ids, tool_label, cluster_id) %>%
count(subject_ids, tool_label, name = "total_clusters")
cluster_proportions <- left_join(n_cluster_high, total_clusters,
by = c("subject_ids", "tool_label")) %>%
mutate(prop_high_conf_clusters = n_high_conf_clusters / total_clusters)
ggplot(cluster_proportions, aes(x = tool_label, y = prop_high_conf_clusters)) +
geom_boxplot(fill = "#4daf4a", alpha = 0.7) +
labs(
title = "Proportion of High-Confidence Clusters per Species",
x = "Species",
y = "Proportion of Clusters (≥50% user agreement)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

##### compare mode estimate with number of high-confidence cluster
mode_positive <- mode_agreement_summary %>%
filter(mode_value > 0)
names(mode_positive)[2] <- 'tool_label'
compare_data <- left_join(n_cluster_high, mode_positive,
by = c("subject_ids", "tool_label"))
compare_data_clean <- compare_data %>%
filter(!is.na(n_high_conf_clusters), !is.na(mode_value))
ggplot(compare_data, aes(x = mode_count, y = n_high_conf_clusters)) +
geom_point(alpha = 0.7, size = 2.5, color = "#1f78b4") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray30") +
facet_wrap(~ tool_label) +
labs(
title = "High-Confidence Clusters vs. Mode Count",
x = "Mode Count",
y = "# High-Confidence Clusters"
) +
theme_minimal() +
theme(strip.text = element_text(face = "bold"))
