A data quality assessment of statewide deer monitoring data
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
# library(deervic)
library(sf)
library(DBI)
library(dplyr)
library(pointblank)
# Postgres connection: better for rasters (and easier to set up)
con <- RPostgreSQL::dbConnect(RPostgres::Postgres(),
host = '10.110.7.201',
dbname = 'ari-dev-weda-psql-01',
user = "psql_admin",
password = keyring::key_get(service = "ari-dev-weda-psql-01", username = "psql_admin"),
port = 5432,
service = NULL,
list(sslmode = "require"))
curated_final_site_selection <- st_read(dsn = con,
layer = Id(schema = "deervic", table = "curated_final_site_selection"))
curated_site_data <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "curated_site_data")) %>%
collect()
curated_site_retrieval_data <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "curated_site_retrieval_data")) %>%
collect()
data_dictionary <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "data_dictionary")) %>%
collect()
all_sites <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "raw_sampling_grid")) %>%
dplyr::select(ID) %>%
collect() %>%
pull()
# Select deer species of interest
deer_species <- c("Dama dama", "Rusa unicolor")
raw_camtrap_records <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "raw_camtrap_records")) %>%
filter(Species %in% !!deer_species) %>%
collect()
The following data quality statement is sourced from data relating to the initial site assessment and camera deployment. This data quality statement can help identify issues relating to the deployment of camera traps and site assessments such is unequal sampling/search area (transects) or data entry issues.
site_agent <-
create_agent(
tbl = curated_site_data,
tbl_name = "curated_site_data",
label = "VALID-I Curated Site Data"
) %>% # Site ID
col_vals_in_set(vars(SiteID), set = c(curated_final_site_selection$ID)) %>%
col_vals_between(vars(SiteID), left = 0, right = max(all_sites)) %>%
rows_distinct(columns = vars(SiteID, Iteration)) %>%
# Personel
col_vals_not_null(vars(User_Name, Personnel)) %>%
# Date
pointblank::col_is_date(vars(Date)) %>%
col_vals_gt(columns = vars(Date), value = "2021-07-01") %>%
# Latitude Longitude
col_is_numeric(vars(Latitude, Longitude)) %>%
col_vals_between(vars(Latitude), left = -39.13658, right = -33.98128) %>%
col_vals_between(vars(Longitude), left = 140.96190, right = 149.97629) %>%
# cameraID
col_vals_regex(columns = vars(CameraID), regex = "^[a-zA-Z]{2}[0-9]{8}$") %>%
# Height
col_vals_not_null(vars(CamHeight)) %>%
col_vals_equal(vars(CamHeight), value = 1) %>%
col_vals_not_null(vars(DistanceM1, DistanceM2, DistanceM3, DistanceM4)) %>%
col_vals_equal(vars(DistanceM1), value = 2.5) %>%
col_vals_equal(vars(DistanceM2), value = 5) %>%
col_vals_equal(vars(DistanceM3), value = 7.5) %>%
col_vals_equal(vars(DistanceM4), value = 10) %>%
# CamBearing
col_vals_between(vars(CamBearing), left = 0, right = 360) %>%
col_vals_between(vars(CamBearing), left = 135, right = 225, brief = "South facing camera") %>%
# Slope
col_vals_between(vars(Slope), left = 0, right = 100) %>%
#Trail
col_vals_in_set(vars(Trail), set = c("Yes", "No", "Not sure")) %>%
col_vals_not_null(vars(DistanceToTrail), preconditions = ~ . %>% dplyr::filter(Trail == "Yes")) %>%
# Images
col_vals_not_null(vars(ImageNorth, ImageEast, ImageSouth, ImageWest)) %>%
#Transects
col_vals_equal(vars(TransDistance), value = 450) %>%
col_vals_equal(vars(Transects), value = 3) %>%
col_vals_not_null(vars(Pellets, Rubbings, Footprints, Wallows)) %>%
# veg measurements
col_vals_not_null(vars(NWUCover, NNWHUCover, EWUCover, ENWHUCover, BGroundCover, TopHeight, CanopyCov, Saplings, Seedlings)) %>%
interrogate()
site_agent
Pointblank Validation | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VALID-I Curated Site Data
tibble
curated_site_data
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
1 | col_vals_in_set()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
2 | col_vals_between()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
3 | rows_distinct()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
4 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
5 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
6 | col_is_date()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
— |
— |
— | ||
7 | col_vals_gt()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
8 | col_is_numeric()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
— |
— |
— | ||
9 | col_is_numeric()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
— |
— |
— | ||
10 | col_vals_between()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
11 | col_vals_between()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
12 | col_vals_regex()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
13 | col_vals_not_null()
|
— |
|
✓ |
100 |
99 0.99 |
1 0.01 |
— |
— |
— |
|||
14 | col_vals_equal()
|
|
✓ |
100 |
96 0.96 |
4 0.04 |
— |
— |
— |
||||
15 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
16 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
17 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
18 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
19 | col_vals_equal()
|
|
✓ |
100 |
99 0.99 |
1 0.01 |
— |
— |
— |
||||
20 | col_vals_equal()
|
|
✓ |
100 |
99 0.99 |
1 0.01 |
— |
— |
— |
||||
21 | col_vals_equal()
|
|
✓ |
100 |
98 0.98 |
2 0.02 |
— |
— |
— |
||||
22 | col_vals_equal()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
23 | col_vals_between()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
24 | col_vals_between()
|
|
✓ |
100 |
97 0.97 |
3 0.03 |
— |
— |
— |
||||
25 | col_vals_between()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
26 | col_vals_in_set()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
27 | col_vals_not_null()
|
— |
|
✓ |
51 |
51 1.00 |
0 0.00 |
— |
— |
— |
— | ||
28 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
29 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
30 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
31 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
32 | col_vals_equal()
|
|
✓ |
100 |
83 0.83 |
17 0.17 |
— |
— |
— |
||||
33 | col_vals_equal()
|
|
✓ |
100 |
93 0.93 |
7 0.07 |
— |
— |
— |
||||
34 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
35 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
36 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
37 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
38 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
39 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
40 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
41 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
42 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
43 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
44 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
45 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
46 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
2022-05-17 10:31:04 AEST 2.0 s 2022-05-17 10:31:06 AEST |
This data quality statement focuses on data collected during camera retrieval and focuses on whether the data collected during retrieval is valid and can be matched up to the deployment data.
retrieval_agent <-
create_agent(
tbl = curated_site_retrieval_data,
tbl_name = "curated_site_retrieval_data",
label = "VALID-I Curated Retrieval Data"
) %>% # Site ID
col_vals_in_set(vars(SiteID), set = c(curated_final_site_selection$ID)) %>%
col_vals_between(vars(SiteID), left = 0, right = max(all_sites)) %>%
col_vals_in_set(vars(SiteID), set = curated_site_data$SiteID) %>%
rows_distinct(columns = vars(SiteID, Iteration)) %>%
# Personel
col_vals_not_null(vars(User_Name, Personnel)) %>%
# Date
pointblank::col_is_date(vars(Date)) %>%
col_vals_gt(columns = vars(Date), value = "2021-07-01") %>%
# cameraID
col_vals_regex(columns = vars(CameraID), regex = "^[a-zA-Z]{2}[0-9]{8}$") %>%
col_vals_in_set(vars(CameraID), set = curated_site_data$CameraID) %>%
interrogate()
retrieval_agent
Pointblank Validation | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VALID-I Curated Retrieval Data
tibble
curated_site_retrieval_data
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
1 | col_vals_in_set()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
2 | col_vals_between()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
3 | col_vals_in_set()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
4 | rows_distinct()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
5 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
6 | col_vals_not_null()
|
— |
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | ||
7 | col_is_date()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
— |
— |
— | ||
8 | col_vals_gt()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
9 | col_vals_regex()
|
|
✓ |
100 |
100 1.00 |
0 0.00 |
— |
— |
— |
— | |||
10 | col_vals_in_set()
|
|
✓ |
100 |
90 0.90 |
10 0.10 |
— |
— |
— |
||||
2022-05-17 10:31:09 AEST < 1 s 2022-05-17 10:31:10 AEST |
Using the R
package camtrapR
we loaded 10949 photos of deer from the retrieved cameras. The following data quality statement ensures that photos of deer have been appropriately tagged with key metadata.
camtrap_agent <-
create_agent(
tbl = raw_camtrap_records,
tbl_name = "raw_camtrap_records",
label = "VALID-I Raw Camera Trap Photo Data"
) %>% # Site ID
col_vals_in_set(vars(Station), set = c(curated_final_site_selection$ID)) %>%
col_vals_between(vars(Station), left = 0, right = max(all_sites)) %>%
col_vals_in_set(vars(Station), set = curated_site_data$SiteID) %>%
# Personel
col_vals_not_null(vars(Station, Species, Date, Time, metadata_Distance)) %>%
# Date
pointblank::col_is_date(vars(Date)) %>%
col_vals_gt(columns = vars(Date), value = "2021-07-01") %>%
# cameraID
interrogate()
camtrap_agent
Pointblank Validation | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VALID-I Raw Camera Trap Photo Data
tibble
raw_camtrap_records
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
1 | col_vals_in_set()
|
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | |||
2 | col_vals_between()
|
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | |||
3 | col_vals_in_set()
|
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | |||
4 | col_vals_not_null()
|
— |
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
5 | col_vals_not_null()
|
— |
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
6 | col_vals_not_null()
|
— |
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
7 | col_vals_not_null()
|
— |
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
8 | col_vals_not_null()
|
— |
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
9 | col_is_date()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
— |
— |
— | ||
10 | col_vals_gt()
|
|
✓ |
11K |
11K 1.00 |
0 0.00 |
— |
— |
— |
— | |||
2022-05-17 10:31:11 AEST < 1 s 2022-05-17 10:31:12 AEST |
If you see mistakes or want to suggest changes, please create an issue on the source repository.
For attribution, please cite this work as
Cally (2022, May 17). Justin's Code Blog: Deer Monitoring Data Quality. Retrieved from https://justincally.github.io/blog/posts/2022-05-17-deer-monitoring-data-quality/
BibTeX citation
@misc{cally2022deerdataquality, author = {Cally, Justin G}, title = {Justin's Code Blog: Deer Monitoring Data Quality}, url = {https://justincally.github.io/blog/posts/2022-05-17-deer-monitoring-data-quality/}, year = {2022} }