2.4 Clean and format data

This step applies the cleaning and formatting procedures: coercing variables to numeric or factor, and excluding columns (constants, perturbed, unreliable) and rows (incomplete data, outliers).

# Build the cleaned data set from the raw `amphorae` table.
cleanAmphorae <- clean_and_format(
  amphorae,
  # c(<variable holding the completion info>, <value indicating completion>)
  completion_variable = c("CHARAC", "complete"),
  # column positions handed over as categorical / numerical, respectively
  categorical_columns = seq_len(112),
  numerical_columns = 113:ncol(amphorae),
  # entries that are converted to NA
  as_na = c("NULL", "indeterminate", "unfired"),
  # method for replacing NAs; NULL is passed here
  # NOTE(review): presumably NULL means no replacement is applied -- confirm
  method = NULL,
  # variables left out of the cleaned data
  columns_to_exclude = c(
    "VOID_VESIC_MEGA", "VOID_VUGH_MEGA", "VOID_CHAN_MEGA", "VOID_PLAN_MEGA",
    "COAR_R_DAC_AND", "COAR_R_EVAP", "COAR_R_CONGBREC", "COAR_R_SERP",
    "COAR_C_SPL", "COAR_C_OPX", "COAR_C_OL", "COAR_C_SIL",
    "COAR_C_ST", "COAR_C_ZRN", "COAR_C_PY",
    "FINE_C_OPX", "FINE_C_ZRN"
  ),
  # observations left out: the Italic amphorae from Port Vendres 4,
  # listed group by group
  rows_to_exclude = c(
    # PV4-IND4
    "PV4033",
    # PV4-CAMP
    "PV4017",
    # PV4-ITT
    "PV4021", "PV4023", "PV4024", "PV4025", "PV4035", "PV4037",
    # PV4-NAP
    "PV4022", "PV4026", "PV4027", "PV4028", "PV4029", "PV4030", "PV4036"
  )
)
| | Variables | Observations |
|---|---|---|
| amphorae | 138 | 238 |
| cleanAmphorae | 91 | 223 |