A data quality assessment of statewide deer monitoring data in 2023
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
layout="l-screen-inset"
)
# library(deervic)
library(sf)
library(DBI)
library(dplyr)
library(pointblank)
# Postgres connection: better for rasters (and easier to set up)
con <- RPostgreSQL::dbConnect(RPostgres::Postgres(),
host = '10.110.7.201',
dbname = 'ari-dev-weda-psql-01',
user = "psql_admin",
password = keyring::key_get(service = "ari-dev-weda-psql-01", username = "psql_admin"),
port = 5432,
service = NULL,
list(sslmode = "require"))
curated_final_site_selection <- st_read(dsn = con,
layer = Id(schema = "deervic", table = "curated_final_site_selection"))
rawdir <- "/Volumes/Cally_Camtr/StatewideDeer/RawFiles"
stations <- list.dirs(rawdir, recursive = F, full.names = F)
curated_site_data <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "curated_site_data")) %>%
filter(SiteID %in% !!stations & User_Name != "Justin") %>%
collect() %>%
mutate(Latitude = as.numeric(Latitude),
Longitude = as.numeric(Longitude),
Date = as.Date(Date, format = "%m/%d/%Y"))
curated_site_retrieval_data <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "curated_site_retrieval_data")) %>%
filter(SiteID %in% !!stations & User_Name != "Justin") %>%
collect() %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y"))
data_dictionary <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "data_dictionary")) %>%
collect()
all_sites <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "raw_sampling_grid")) %>%
dplyr::select(ID) %>%
collect() %>%
pull()
# Select deer species of interest
# deer_species <- c("Dama dama", "Rusa unicolor")
#
# raw_camtrap_records <- tbl(con, dbplyr::in_schema(schema = "deervic", table = "raw_camtrap_records")) %>%
# filter(Species %in% !!deer_species) %>%
# collect()
The following data quality statement is sourced from data relating to the initial site assessment and camera deployment. This data quality statement can help identify issues relating to the deployment of camera traps and site assessments such is unequal sampling/search area (transects) or data entry issues.
site_agent <-
create_agent(
tbl = curated_site_data,
tbl_name = "curated_site_data",
label = "VALID-I Curated Site Data", actions = action_levels(stop_at = 1),
) %>% # Site ID
col_vals_in_set(vars(SiteID), set = c(curated_final_site_selection$ID)) %>%
col_vals_between(vars(SiteID), left = 0, right = max(all_sites)) %>%
rows_distinct(columns = vars(SiteID, Iteration)) %>%
# Personel
col_vals_not_null(vars(User_Name, Personnel)) %>%
# Date
pointblank::col_is_date(vars(Date)) %>%
col_vals_gt(columns = vars(Date), value = "2021-07-01") %>%
# Latitude Longitude
col_is_numeric(vars(Latitude, Longitude)) %>%
col_vals_between(vars(Latitude), left = -39.13658, right = -33.98128) %>%
col_vals_between(vars(Longitude), left = 140.96190, right = 149.97629) %>%
# cameraID
col_vals_regex(columns = vars(CameraID), regex = "^[a-zA-Z]{2}[0-9]{8}$") %>%
# Height
col_vals_not_null(vars(CamHeight)) %>%
col_vals_equal(vars(CamHeight), value = 1) %>%
col_vals_not_null(vars(DistanceM1, DistanceM2, DistanceM3, DistanceM4)) %>%
col_vals_equal(vars(DistanceM1), value = 2.5) %>%
col_vals_equal(vars(DistanceM2), value = 5) %>%
col_vals_equal(vars(DistanceM3), value = 7.5) %>%
col_vals_equal(vars(DistanceM4), value = 10) %>%
# CamBearing
col_vals_between(vars(CamBearing), left = 0, right = 360) %>%
col_vals_between(vars(CamBearing), left = 135, right = 225, brief = "South facing camera") %>%
# Slope
col_vals_between(vars(Slope), left = 0, right = 100) %>%
#Trail
col_vals_in_set(vars(Trail), set = c("Yes", "No", "Not sure")) %>%
col_vals_not_null(vars(DistanceToTrail), preconditions = ~ . %>% dplyr::filter(Trail == "Yes")) %>%
# Images
col_vals_not_null(vars(ImageNorth, ImageEast, ImageSouth, ImageWest)) %>%
#Transects
col_vals_equal(vars(TransDistance), value = 450) %>%
col_vals_equal(vars(Transects), value = 3) %>%
col_vals_not_null(vars(Pellets, Rubbings, Footprints, Wallows)) %>%
# veg measurements
col_vals_not_null(vars(NWUCover, NNWHUCover, EWUCover, ENWHUCover, BGroundCover, TopHeight, CanopyCov, Saplings, Seedlings)) %>%
interrogate()
site_agent
Pointblank Validation | |||||||||||||
VALID-I Curated Site Data
tibble
curated_site_dataWARN
—
STOP
1
NOTIFY
—
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | col_vals_in_set()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
2 | col_vals_between()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
3 | rows_distinct()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
4 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
5 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
6 | col_is_date()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
7 | col_vals_gt()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
8 | col_is_numeric()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
9 | col_is_numeric()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
10 | col_vals_between()
|
|
✓ |
50 |
31 0.62 |
19 0.38 |
— |
● |
— |
||||
11 | col_vals_between()
|
|
✓ |
50 |
45 0.90 |
5 0.10 |
— |
● |
— |
||||
12 | col_vals_regex()
|
|
✓ |
50 |
0 0.00 |
50 1.00 |
— |
● |
— |
||||
13 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
14 | col_vals_equal()
|
|
✓ |
50 |
46 0.92 |
4 0.08 |
— |
● |
— |
||||
15 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
16 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
17 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
18 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
19 | col_vals_equal()
|
|
✓ |
50 |
49 0.98 |
1 0.02 |
— |
● |
— |
||||
20 | col_vals_equal()
|
|
✓ |
50 |
49 0.98 |
1 0.02 |
— |
● |
— |
||||
21 | col_vals_equal()
|
|
✓ |
50 |
49 0.98 |
1 0.02 |
— |
● |
— |
||||
22 | col_vals_equal()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
23 | col_vals_between()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
24 | col_vals_between()
|
|
✓ |
50 |
47 0.94 |
3 0.06 |
— |
● |
— |
||||
25 | col_vals_between()
|
|
✓ |
50 |
48 0.96 |
2 0.04 |
— |
● |
— |
||||
26 | col_vals_in_set()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
27 | col_vals_not_null()
|
— |
|
✓ |
19 |
19 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
28 | col_vals_not_null()
|
— |
|
✓ |
50 |
0 0.00 |
50 1.00 |
— |
● |
— |
|||
29 | col_vals_not_null()
|
— |
|
✓ |
50 |
0 0.00 |
50 1.00 |
— |
● |
— |
|||
30 | col_vals_not_null()
|
— |
|
✓ |
50 |
0 0.00 |
50 1.00 |
— |
● |
— |
|||
31 | col_vals_not_null()
|
— |
|
✓ |
50 |
0 0.00 |
50 1.00 |
— |
● |
— |
|||
32 | col_vals_equal()
|
|
✓ |
50 |
43 0.86 |
7 0.14 |
— |
● |
— |
||||
33 | col_vals_equal()
|
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
34 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
35 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
36 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
37 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
38 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
39 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
40 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
41 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
42 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
43 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
44 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
45 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
46 | col_vals_not_null()
|
— |
|
✓ |
50 |
50 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
2023-03-27 10:51:06 AEDT 3.2 s 2023-03-27 10:51:09 AEDT |
This data quality statement focuses on data collected during camera retrieval and focuses on whether the data collected during retrieval is valid and can be matched up to the deployment data.
retrieval_agent <-
create_agent(
tbl = curated_site_retrieval_data,
tbl_name = "curated_site_retrieval_data",
label = "VALID-I Curated Retrieval Data", actions = action_levels(stop_at = 1),
) %>% # Site ID
col_vals_in_set(vars(SiteID), set = c(curated_final_site_selection$ID)) %>%
col_vals_between(vars(SiteID), left = 0, right = max(all_sites)) %>%
col_vals_make_set(vars(SiteID), set = curated_site_data$SiteID) %>%
rows_distinct(columns = vars(SiteID, Iteration)) %>%
# Personel
col_vals_not_null(vars(User_Name, Personnel)) %>%
# Date
pointblank::col_is_date(vars(Date)) %>%
col_vals_gt(columns = vars(Date), value = "2021-07-01") %>%
# cameraID
col_vals_regex(columns = vars(CameraID), regex = "^[a-zA-Z]{2}[0-9]{8}$") %>%
col_vals_in_set(vars(CameraID), set = curated_site_data$CameraID) %>%
interrogate()
retrieval_agent
Pointblank Validation | |||||||||||||
VALID-I Curated Retrieval Data
tibble
curated_site_retrieval_dataWARN
—
STOP
1
NOTIFY
—
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | col_vals_in_set()
|
|
✓ |
48 |
48 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
2 | col_vals_between()
|
|
✓ |
48 |
48 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
3 | col_vals_make_set()
|
|
✓ |
51 |
49 0.96 |
2 0.04 |
— |
● |
— |
||||
4 | rows_distinct()
|
— |
|
✓ |
48 |
48 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
5 | col_vals_not_null()
|
— |
|
✓ |
48 |
48 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
6 | col_vals_not_null()
|
— |
|
✓ |
48 |
48 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
7 | col_is_date()
|
— |
|
✓ |
1 |
1 1.00 |
0 0.00 |
— |
○ |
— |
— | ||
8 | col_vals_gt()
|
|
✓ |
48 |
48 1.00 |
0 0.00 |
— |
○ |
— |
— | |||
9 | col_vals_regex()
|
|
✓ |
48 |
0 0.00 |
48 1.00 |
— |
● |
— |
||||
10 | col_vals_in_set()
|
|
✓ |
48 |
46 0.96 |
2 0.04 |
— |
● |
— |
||||
2023-03-27 10:51:13 AEDT < 1 s 2023-03-27 10:51:13 AEDT |
If you see mistakes or want to suggest changes, please create an issue on the source repository.
For attribution, please cite this work as
Cally (2023, March 3). Justin's Code Blog: Deer Monitoring Data Quality Statewide Surveys By Wildlife Unlimited. Retrieved from https://justincally.github.io/blog/posts/2023-03-03-deer-dq-23/
BibTeX citation
@misc{cally2022deerdataquality23, author = {Cally, Justin G}, title = {Justin's Code Blog: Deer Monitoring Data Quality Statewide Surveys By Wildlife Unlimited}, url = {https://justincally.github.io/blog/posts/2023-03-03-deer-dq-23/}, year = {2023} }