# Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pytaxize import scicomm

# The taxonomy release has no header row, so the column names are supplied
# here; fields are separated by semicolons.
columns = ['uuid', 'class', 'order', 'family', 'genus', 'species', 'common_name']
df_taxa = pd.read_csv("taxonomy_release.txt", sep=";", header=None, names=columns)

# Peek at the first rows to confirm the columns lined up as expected.
display(df_taxa.head())

# Order the taxa from the broadest rank (class) down to species.
rank_columns = columns[1:6]
df_sorted = df_taxa.sort_values(by=rank_columns)
display(df_sorted.head())

# BigQuery query against the public GBIF occurrences table.
# NOTE(review): presumably the query behind the "bq-withcountry.csv" export that is read
# later in this file -- confirm.
#
# Step 1 (CTE unique_species): distinct taxa (class .. species, plus taxonkey) that have an
# occurrence inside the polygon below and whose phylum is Chordata.
WITH unique_species AS (
    SELECT DISTINCT
    class,
    `order`,
    family,
    genus,
    species,
    taxonkey

    FROM
    `bigquery-public-data.gbif.occurrences` 

    WHERE 
    # Keep only occurrences whose point lies within the closed polygon (first and last
    # vertex are identical). Vertices are given as "longitude latitude".
    # NOTE(review): presumably outlines the Chicago area, given the chicago_species list
    # built from this data later in the file -- confirm.
    ST_WITHIN(
        ST_GEOGPOINT(decimallongitude, decimallatitude),
        ST_GEOGFROMTEXT('POLYGON((
            -87.69081115722656 42.005312912238956, 
            -87.66952514648438 41.955818412264705, 
            -87.61596679687501 41.905774595463853, 
            -87.60910034179689 41.85779952612765, 
            -87.62626647949219 41.815801430687642, 
            -87.7196502685547 41.808127409160392, 
            -87.71690368652345 41.842908943268263, 
            -87.67982482910158 41.88533726561532, 
            -87.72377014160158 41.946119107705776, 
            -87.78625488281251 41.99051961904691, 
            -87.69081115722656 42.005312912238956
        ))')
    )
    AND LOWER(phylum) = "chordata" # This is the phylum that includes mammals and birds
    
    # NOTE(review): there is no ORDER BY before this LIMIT, so the query does not specify
    # which 1000 distinct taxa are kept.
    LIMIT 1000 
)

# Step 2: join every selected taxon back to all occurrences sharing its taxonkey (this join
# is not restricted to the polygon) and collect the distinct, non-null country codes.
SELECT
  t1.*,
  ARRAY_AGG(DISTINCT countrycode IGNORE NULLS) AS country_codes

FROM unique_species t1
LEFT JOIN `bigquery-public-data.gbif.occurrences` 
USING(taxonkey)

# One output row per taxon selected in step 1.
GROUP BY 
  class,
  `order`,
  family,
  genus,
  species,
  taxonkey

# Load the BigQuery (GBIF) export.
df = pd.read_csv("bq-withcountry.csv")


def _lowercase_if_text(value):
    """Lower-case string cells; pass every other value through unchanged."""
    return value.lower() if isinstance(value, str) else value


# The export's "species" column holds the whole scientific name, so it is kept
# under the name "scientific_name" ...
df_filtered = df.map(_lowercase_if_text).rename(columns={'species': 'scientific_name'})

# ... and a new "species" column receives only its last whitespace-separated
# word (the epithet).
df_filtered['species'] = df_filtered['scientific_name'].str.split().str.get(-1)

df_gbq = df_filtered.copy()

display(df_filtered.head())

# Attach the taxonomy's common names to the GBIF rows by matching on the full
# rank hierarchy; GBIF rows without a counterpart keep a NaN common_name.
# NOTE(review): pandas matches null keys against each other, so a GBIF row
# whose ranks are all NaN pairs with every taxonomy row whose ranks are all
# NaN (this looks like how 'animal', 'vehicle' and 'blank' end up among the
# matches) -- confirm whether that is intended.
hierarchy = ['class', 'order', 'family', 'genus', 'species']
merged_df = df_gbq.merge(df_taxa[hierarchy + ['common_name']], how='left', on=hierarchy)
merged_df.tail()

# True where a common name was found for the row.
has_common_name = merged_df['common_name'].notna()

print("Rows with matching common names:")
display(merged_df[has_common_name].head())

print("\nRows without matching common names:")
display(merged_df[~has_common_name].head())

# Tally both sides of the split.
total_matches = has_common_name.sum()
print(f"\nTotal matches found: {total_matches}")
print(f"Total rows without matches: {len(merged_df) - total_matches}")

Rows with matching common names:

Rows without matching common names:

Total matches found: 391
Total rows without matches: 534

# Keep only the rows that received a common name and write them to disk
# (the DataFrame index is written as the first CSV column).
df_matched = merged_df.dropna(subset=['common_name'])
df_matched.to_csv('gbq_to_taxa_common_matched.csv')

# Common names of the matched rows, kept for the comparison against the
# speciesnet output; duplicates are not removed.
chicago_species = df_matched['common_name'].to_list()
print(chicago_species)

['cattle egret', 'crested partridge', 'pied-billed grebe', 'rock dove', 'animal', 'vehicle', 'blank', 'yellow-crowned night-heron', 'great blue heron', 'busard saint-martin', 'lesser black-backed gull', 'northern mockingbird', 'belted kingfisher', 'golden eagle', 'blue-winged teal', 'american wigeon', 'blue grosbeak', 'eastern cottontail', 'cinnamon teal', "wilson's snipe", 'green heron', 'great blue heron', 'peromyscus species', 'long-billed dowitcher', 'brown thrasher', 'myrtle warbler', 'blue jay', 'sharp-tailed grouse', 'marsh wren', 'double-crested cormorant', 'mexican flying squirrel', 'american robin', 'coyote', 'red-headed woodpecker', 'black-capped chickadee', 'brown-headed cowbird', 'hairy woodpecker', "cassin's sparrow", 'spotted sandpiper', 'héron à dos vert', 'empidonax species', 'common grackle', 'moorhen', 'belted kingfisher', 'hairy woodpecker', 'red-winged blackbird', 'hairy woodpecker', 'american badger', 'chukar', 'larus species', 'short-eared owl', 'héron à dos vert', 'domestic horse', 'zosterops species', 'faucon pèlerin', 'house mouse', 'mute swan', 'spotted towhee', 'american beaver', 'gadwall', 'chipping sparrow', 'eurasian collared-dove', 'common starling', 'russet-backed thrush', 'california gull', 'wren', 'ring-necked pheasant', 'superb starling', 'european rabbit', 'american black duck', 'common yellowthroat', 'north american river otter', 'vesper sparrow', 'house wren', 'antrostomus species', 'swamp sparrow', 'eastern screech-owl', 'western meadowlark', 'northern bobwhite', 'pine warbler', 'song sparrow', 'russet-backed thrush', 'american crow', 'horned lark', 'sphyrapicus species', 'blue jay', 'muscovy duck', 'botaurus species', 'virginia rail', 'domestic cat', 'sciurus species', 'solitary sandpiper', 'red fox-sparrow', 'ovenbird', 'house rat', 'canada goose', 'goosander', 'microtus species', 'moorhen', 'great crested flycatcher', 'double-crested cormorant', 'spotted sandpiper', 'accipitridae family', 'short-eared owl', 'american white 
pelican', 'veery', 'yellow-headed blackbird', 'marsh wren', 'common ground-dove', 'eastern kingbird', 'palm warbler', 'woodchuck', 'northern goshawk', 'red-shouldered hawk', 'double-crested cormorant', 'colaptes species', 'eastern chipmunk', 'barred owl', 'dark-eyed junco', 'great horned owl', 'gray-cheeked thrush', 'brown thrasher', 'long-eared owl', 'gray-cheeked thrush', 'house sparrow', 'house finch', 'greater white-fronted goose', 'northern bobwhite', 'grey catbird', 'barn swallow', 'house sparrow', 'mourning dove', 'golden eagle', 'ring-necked duck', 'common yellowthroat', 'mallard', 'faucon pèlerin', 'mourning dove', 'song sparrow', 'american avocet', 'hirundinidae family', 'gray-cheeked thrush', 'black-and-white warbler', 'western tanager', 'american coot', 'virginia opossum', 'great white egret', 'red-shouldered hawk', 'western grebe', 'song sparrow', 'blue jay', 'pine warbler', 'mallard', 'eastern screech-owl', 'barred owl', 'eastern bluebird', 'common nighthawk', 'eastern fox squirrel', "steller's jay", 'killdeer', 'rough-legged buzzard', 'little blue heron', 'anatidae family', 'white-winged dove', 'moorhen', 'american kestrel', 'weasel species', 'red-bellied woodpecker', 'green heron', 'chipping sparrow', 'western kingbird', 'greater yellowlegs', 'snowy egret', 'wood duck', 'canada goose', 'muskrat', 'downy woodpecker', 'palm warbler', "wilson's warbler", 'song sparrow', 'ovenbird', 'cedar waxwing', 'loggerhead shrike', 'swamp sparrow', 'north american deermouse', 'lark sparrow', 'grasshopper sparrow', "swainson's hawk", 'red-headed woodpecker', 'white-breasted nuthatch', "cooper's hawk", 'eastern phoebe', "williamson's sapsucker", 'lark bunting', 'eastern gray squirrel', 'brown rat', 'black-bellied whistling-duck', 'upland sandpiper', 'domestic turkey', 'russet-backed thrush', 'moorhen', 'common grackle', 'grey wolf', 'pine warbler', 'dark-eyed junco', 'belted kingfisher', 'northern raccoon', 'american redstart', 'busard saint-martin', 'american 
bittern', 'bald eagle', 'wood thrush', 'rusty blackbird', "bewick's wren", 'northern cardinal', "wilson's warbler", 'song sparrow', 'northern raccoon', 'bald eagle', 'clay-colored sparrow', "brewer's blackbird", 'house finch', 'long-tailed weasel', 'rusty blackbird', 'red-breasted nuthatch', 'palm warbler', 'moorhen', 'aythya species', 'least bittern', 'grey wolf', 'northern flicker', 'domestic guineafowl', 'common barn owl', 'mallard', 'horned lark', 'willet', 'american wigeon', 'northern pintail', 'goosander', 'goéland argenté', 'red-winged blackbird', 'american crow', 'common starling', 'rallidae family', 'broad-winged hawk', 'green-winged teal', 'dark-eyed junco', 'black-headed grosbeak', 'broad-winged hawk', 'house finch', 'red fox', 'long-eared owl', 'palm warbler', 'northern flicker', "gambel's quail", 'white-breasted nuthatch', 'palm warbler', 'eastern gray squirrel', 'buff-bellied pipit', 'hairy woodpecker', 'grasshopper sparrow', 'eastern wood-pewee', "bewick's wren", "cassin's sparrow", 'domestic turkey', 'northern raccoon', 'marsh wren', 'bat', 'blue grosbeak', 'mallard', 'least bittern', 'red fox', 'black-crowned night-heron', 'ring-necked pheasant', 'cattle egret', 'killdeer', 'red-tailed hawk', 'rock dove', 'sylvilagus species', 'black-crowned night-heron', 'virginia rail', 'california quail', 'blue-winged teal', 'white-throated sparrow', 'myrtle warbler', 'geothlypis species', 'golden-crowned kinglet', 'busard saint-martin', 'setophaga species', 'hooded merganser', 'downy woodpecker', 'canada goose', 'hermit thrush', 'pine warbler', 'veery', 'american robin', 'hermit thrush', 'great horned owl', 'red fox-sparrow', 'veery', 'varied thrush', 'northern goshawk', 'american herring gull', 'vesper sparrow', 'loggerhead shrike', 'lark sparrow', 'song sparrow', 'red-winged blackbird', 'marsh wren', 'striped skunk', 'black-billed magpie', 'white-crowned sparrow', "townsend's solitaire", 'budgerigar', 'rock dove', 'red junglefowl', 'red-tailed hawk', 'bird', 
'rock dove', 'northern goshawk', 'rufous-sided towhee', 'black-capped chickadee', 'buff-bellied pipit', 'gadwall', 'buteo species', 'house mouse', 'dryobates species', 'great black-backed gull', 'bufflehead', 'junco species', 'striped skunk', 'ring-billed gull', 'house wren', 'green peafowl', 'brown-headed cowbird', 'american woodcock', 'white ibis', 'white-crowned sparrow', 'spotted towhee', 'ruby-crowned kinglet', 'ruby-crowned kinglet', 'red-winged blackbird', 'swamp sparrow', 'eastern fox squirrel', 'black-billed magpie', "wilson's warbler", 'house finch', 'eastern screech-owl', 'dark-eyed junco', 'eastern screech-owl', 'north american deermouse', 'ovenbird', 'american herring gull', 'rattus species', 'jabiru', 'common nighthawk', "wilson's warbler", 'barn swallow', 'american coot', 'belted kingfisher', 'great white egret', 'tufted titmouse', 'northern cardinal', 'black-crowned night-heron', 'marbled godwit', 'sora', 'goldfinch', 'solitary sandpiper', 'golden-crowned kinglet', 'passer species', 'ruby-crowned kinglet', 'myrtle warbler', 'greater white-fronted goose', 'myrtle warbler', 'pied-billed grebe', 'american kestrel', 'sage thrasher', 'virginia opossum', 'eastern bluebird', 'mallard', 'grey crowned-crane', 'coyote', "wilson's snipe", 'white-footed mouse', 'downy woodpecker', 'black-capped chickadee', 'rufous-sided towhee', 'ruffed grouse', 'ferruginous hawk', 'northern flicker', 'red fox-sparrow']

# Load the per-label counts of the run being checked. Its "class" column holds
# the predicted label (for example "blank" or "bird"), not a taxonomic class.
species_data = pd.read_csv('../runs/full-no-geo/simple_class_counts.csv').reset_index(drop=True)
species_data.head()

# Labels from the run that do not appear in the Chicago common-name list.
species_list = species_data['class'].tolist()
mismatched_species = [label for label in species_list if label not in chicago_species]
mismatch_total = len(mismatched_species)
print(f"Total mismatched species: {mismatch_total}")
print(f"Total matched species: {len(species_list) - mismatch_total}")

Total mismatched species: 38
Total matched species: 25

# Express both sides of the comparison as a percentage of all labels.
total_species = len(species_list)
mismatched_count = len(mismatched_species)
matched_count = total_species - mismatched_count
mismatched_percentage = (mismatched_count / total_species) * 100
matched_percentage = (matched_count / total_species) * 100
print(f"Percentage of mismatched species: {mismatched_percentage:.2f}%")
print(f"Percentage of matched species: {matched_percentage:.2f}%")

Percentage of mismatched species: 60.32%
Percentage of matched species: 39.68%

# Split the run's labels into those present in the Chicago list and those
# absent from it, preserving their original order.
matching_species, mismatched_species = [], []
for label in species_list:
    bucket = matching_species if label in chicago_species else mismatched_species
    bucket.append(label)

print(f"Matching species ({len(matching_species)}): {matching_species}")
print(f"Mismatched species ({len(mismatched_species)}): {mismatched_species}")

Matching species (25): ['blank', 'bird', 'mallard', 'american coot', 'northern raccoon', 'great blue heron', 'vehicle', 'eastern cottontail', 'brown rat', 'domestic cat', 'muskrat', 'wood duck', 'coyote', 'canada goose', 'american beaver', 'eastern gray squirrel', 'domestic horse', 'american robin', 'white-crowned sparrow', 'sylvilagus species', 'song sparrow', 'snowy egret', 'california quail', 'north american river otter', 'eastern chipmunk']
Mismatched species (38): ['human', 'western pond turtle', 'anseriformes order', 'domestic dog', 'reptile', 'domestic cattle', 'wild turkey', 'central american agouti', 'white-tailed deer', 'nutria', 'crocodile', 'wild boar', 'mammal', 'common tapeti', "tome's spiny rat", 'ocellated turkey', 'rodent', 'collared peccary', 'spotted paca', 'madagascar crested ibis', 'canis species', 'red squirrel', 'bushy-tailed woodrat', 'pronghorn', 'eastern red forest rat', 'domestic chicken', 'blood pheasant', 'white-lipped peccary', 'red acouchi', 'desert cottontail', 'plains zebra', 'rufescent tiger-heron', 'owl', 'common wombat', 'bearded pig', 'fossa', 'nine-banded armadillo', "guenther's dik-dik"]

from itertools import zip_longest

# Lay the two lists side by side for review; the shorter one is padded with
# pd.NA so both columns have the same length.
paired_rows = list(zip_longest(matching_species, mismatched_species, fillvalue=pd.NA))
review_columns = ['matching', 'non_matching (speciesnet!gbif)']
review_df = pd.DataFrame(paired_rows, columns=review_columns)

print(review_df)

                      matching non_matching (speciesnet!gbif)
0                        blank                          human
1                         bird            western pond turtle
2                      mallard             anseriformes order
3                american coot                   domestic dog
4             northern raccoon                        reptile
5             great blue heron                domestic cattle
6                      vehicle                    wild turkey
7           eastern cottontail        central american agouti
8                    brown rat              white-tailed deer
9                 domestic cat                         nutria
10                     muskrat                      crocodile
11                   wood duck                      wild boar
12                      coyote                         mammal
13                canada goose                  common tapeti
14             american beaver               tome's spiny rat
15       eastern gray squirrel               ocellated turkey
16              domestic horse                         rodent
17              american robin               collared peccary
18       white-crowned sparrow                   spotted paca
19          sylvilagus species        madagascar crested ibis
20                song sparrow                  canis species
21                 snowy egret                   red squirrel
22            california quail           bushy-tailed woodrat
23  north american river otter                      pronghorn
24            eastern chipmunk         eastern red forest rat
25                        <NA>               domestic chicken
26                        <NA>                 blood pheasant
27                        <NA>           white-lipped peccary
28                        <NA>                    red acouchi
29                        <NA>              desert cottontail
30                        <NA>                   plains zebra
31                        <NA>          rufescent tiger-heron
32                        <NA>                            owl
33                        <NA>                  common wombat
34                        <NA>                    bearded pig
35                        <NA>                          fossa
36                        <NA>          nine-banded armadillo
37                        <NA>             guenther's dik-dik

# Some mismatched labels are taken as given and must not be counted as
# mismatches, so they are filtered out here.
to_remove = ['human', 'domestic dog', 'mammal', 'rodent', 'reptile', 'anseriformes order', 'canis species']  # example items to remove, edit as needed
mismatched_species = list(filter(lambda label: label not in to_remove, mismatched_species))

# Pull the count and percentage columns for the labels that are still
# considered mismatched.
is_mismatched = species_data['class'].isin(mismatched_species)
mismatched_stats = species_data.loc[is_mismatched, ['class', 'count', '%']]

print(mismatched_stats['class'].to_list())
display(mismatched_stats)

['western pond turtle', 'domestic cattle', 'wild turkey', 'central american agouti', 'white-tailed deer', 'nutria', 'crocodile', 'wild boar', 'common tapeti', "tome's spiny rat", 'ocellated turkey', 'collared peccary', 'spotted paca', 'madagascar crested ibis', 'red squirrel', 'bushy-tailed woodrat', 'pronghorn', 'eastern red forest rat', 'domestic chicken', 'blood pheasant', 'white-lipped peccary', 'red acouchi', 'desert cottontail', 'plains zebra', 'rufescent tiger-heron', 'owl', 'common wombat', 'bearded pig', 'fossa', 'nine-banded armadillo', "guenther's dik-dik"]

# For every label that should not have been found in Chicago, collect its
# taxonomy row so the countries it occurs in can be looked up afterwards.
in_mismatch_list = df_taxa['common_name'].isin(mismatched_species)
rank_order = ['class', 'order', 'family', 'genus', 'species']
df_missmatched = df_taxa.loc[in_mismatch_list].sort_values(by=rank_order)

# "genus species" name; it is NaN whenever either part is missing.
df_missmatched["scientific_name"] = df_missmatched['genus'] + " " + df_missmatched['species']

print(f'total missmatched and in taxa list: {len(df_missmatched)}')
display(df_missmatched.head())

total missmatched and in taxa list: 30

from pygbif import species as species_gbif


def get_gbif_key(name):
    """Return the GBIF backbone ``usageKey`` (taxonKey) for a scientific name.

    Args:
        name: Scientific name to look up. Anything that is not a non-blank
            string (``None``, NaN, ``""``, whitespace) is treated as "no name".

    Returns:
        The usage key as an ``int``, or ``None`` when there is no usable name,
        the backbone reports no match, or the lookup itself fails.
    """
    if not isinstance(name, str) or not name.strip():
        return None
    try:
        result = species_gbif.name_backbone(name=name)
        usage_key = result.get("usageKey")
        # A response without "usageKey" means the backbone found no match.
        # That is an ordinary outcome, so return None quietly instead of
        # letting int(None) raise and be reported as an error.
        return None if usage_key is None else int(usage_key)
    except Exception as e:
        # Best effort: one failed lookup (network problem, malformed
        # response) must not abort the whole column, so report it and go on.
        print(f"Error with name '{name}': {e}")
        return None
    
# Look up the GBIF taxon key for every scientific name, one call per row.
raw_keys = df_missmatched["scientific_name"].map(get_gbif_key)

# Lookups that produced no key become missing values; the nullable Int64
# dtype keeps the remaining keys as integers.
df_missmatched["taxanomic"] = pd.to_numeric(raw_keys, errors='coerce').astype('Int64')

display(df_missmatched.head())

# get country codes
from pygbif import occurrences as occ

def _facet_names(response):
    """Return the ``name`` of each count in the first facet of a search response."""
    facets = response.get('facets') or []
    if not facets:
        # No facet block at all (e.g. nothing to count): nothing to report.
        return []
    return [item['name'] for item in facets[0].get('counts', [])]


def get_countries_and_us_states(taxa_key):
    """Look up where a taxon has GBIF occurrence records.

    Args:
        taxa_key: GBIF taxon key. May be missing (``None``, NaN, ``pd.NA``) or
            otherwise not convertible to ``int``; such keys yield empty lists.

    Returns:
        A dict ``{"countries": [...], "us_states": [...]}``. ``countries``
        holds the names of the ``country`` facet counts. ``us_states`` holds
        the names of the ``stateProvince`` facet counts for ``country='US'``
        and is only queried when ``'US'`` is among the countries. Either list
        is empty when its lookup failed or returned nothing.
    """
    result = {"countries": [], "us_states": []}

    try:
        taxa_key_int = int(taxa_key)
    except (TypeError, ValueError):
        # A missing key is an expected input (some names have no GBIF match),
        # not an error worth reporting.
        return result

    try:
        cc = occ.search(taxonKey=taxa_key_int, facet='country', facetLimit=200)
        result["countries"] = _facet_names(cc)

        if 'US' in result["countries"]:
            sc = occ.search(taxonKey=taxa_key_int, country='US', facet='stateProvince', facetLimit=200)
            result["us_states"] = _facet_names(sc)

    except Exception as e:
        # Best effort: report the failed request and keep whatever was
        # collected before it, so one bad taxon does not stop the run.
        print(f"Error with id '{taxa_key}': {e}")
    return result

# Query the locations for every taxon key, then spread each result dict over
# the two new columns.
location_records = df_missmatched['taxanomic'].apply(get_countries_and_us_states)
df_missmatched[['countries', 'us_states']] = location_records.apply(pd.Series)

Error with id 'nan': cannot convert float NaN to integer

# Show the first rows of the completed frame (taxonomy, taxon key, countries
# and US states).
display(df_missmatched.head())

# Persist the result; the DataFrame index is written as the first CSV column
# because index=False is not passed.
df_missmatched.to_csv('missmatched_with_locations.csv')

import plotly.express as px

# One row per (taxon, country): unpack each row's list of countries.
df_exploded = df_missmatched.explode('countries')
df_exploded = df_exploded.rename(columns={'countries': 'country_code'})


def _join_unique_names(names):
    """Join the distinct names of one group with the '<br>' separator."""
    return '<br>'.join(names.unique())


# One row per country, listing every species recorded there once.
df_grouped = df_exploded.groupby('country_code')['common_name'].apply(_join_unique_names).reset_index()
# Constant column used only to colour the map (all 1s).
df_grouped['presence'] = 1

display(df_grouped.head())

import pycountry

def alpha2_to_alpha3(alpha2):
    """Convert an ISO 3166-1 alpha-2 country code to its alpha-3 code.

    Args:
        alpha2: Two-letter country code. Non-string values (``None``, NaN)
            are treated as unknown.

    Returns:
        The three-letter code, or ``None`` when the input is not a string or
        pycountry does not know the code.
    """
    if not isinstance(alpha2, str):
        return None
    try:
        country = pycountry.countries.get(alpha_2=alpha2)
    except LookupError:
        # Some pycountry versions raise KeyError (a LookupError) for unknown
        # codes instead of returning None. Catching only this, rather than a
        # bare except, lets KeyboardInterrupt and real failures propagate.
        return None
    return None if country is None else country.alpha_3

# Add the alpha-3 form of every country code; codes that cannot be converted
# are left empty.
alpha3_codes = df_grouped['country_code'].map(alpha2_to_alpha3)
df_grouped['country_code_3'] = alpha3_codes

display(df_grouped.head())

# Keep only the countries whose code could be converted to alpha-3.
df_grouped = df_grouped[df_grouped['country_code_3'].notna()]

# "presence" is the same for every row, so all listed countries share one
# colour; the species list is carried along as hover data.
world_map_options = dict(
    locations='country_code_3',
    color='presence',
    hover_name='country_code',
    hover_data={'common_name': True, 'presence': False},
    color_continuous_scale='Viridis',
    scope='world',
    labels={'common_name': 'Species Found'},
    title='Species Occurrence by Country',
)
fig = px.choropleth(df_grouped, **world_map_options)

# Custom hover text: the location followed by the species list taken from the
# first hover-data column.
hover_text = '<b>%{location}</b><br>Species:<br>%{customdata[0]}<extra></extra>'
fig.update_traces(hovertemplate=hover_text)

fig.show()

import re

# Mapping full state names to their 2-letter abbreviations
us_states = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY', 'District of Columbia': 'DC'
}

# Also allow reverse lookup from abbreviation to itself for convenience
abbrev_set = set(us_states.values())

# Lower-cased full name -> abbreviation, so full-name lookups are truly
# case-insensitive. Capitalising each word instead would turn
# "District of Columbia" into "District Of Columbia", which is not a key.
_state_code_by_lower_name = {name.lower(): code for name, code in us_states.items()}

# A two-character code in parentheses, as in "Texas (TX)".
_PAREN_CODE_RE = re.compile(r'\((\w{2})\)')


def normalize_state(state_str):
    """Normalize various US state formats to the 2-letter code.

    Accepted forms, checked in this order:
      1. a name followed by a parenthesised code, e.g. "Texas (TX)";
      2. a bare 2-letter abbreviation, e.g. "tx";
      3. a full state name in any letter case, e.g. "new york".

    Args:
        state_str: Value to normalize. Non-string values return ``None``.

    Returns:
        The upper-case 2-letter code, or ``None`` when nothing matches.
    """
    if not isinstance(state_str, str):
        return None

    state_str = state_str.strip()

    # Case 1: Matches format like "Texas (TX)"
    m = _PAREN_CODE_RE.search(state_str)
    if m:
        code = m.group(1).upper()
        if code in abbrev_set:
            return code
        # Unknown code in the parentheses: fall through to the other forms.

    # Case 2: Is already a 2-letter abbreviation?
    if len(state_str) == 2 and state_str.upper() in abbrev_set:
        return state_str.upper()

    # Case 3: Full state name. Runs of whitespace are collapsed and the
    # comparison ignores letter case.
    normalized_name = ' '.join(state_str.split()).lower()
    return _state_code_by_lower_name.get(normalized_name)

# Assumes the us_states column holds a list of strings per row,
# e.g. ['Texas (TX)', 'CA', 'Florida'].
# The chain below unpacks each list into one row per state, converts every
# entry to its 2-letter code, and drops entries that could not be converted.
df_exploded = (
    df_missmatched.explode('us_states')
    .assign(state_code=lambda frame: frame['us_states'].apply(normalize_state))
    .dropna(subset=['state_code'])
)

# The cleaned 2-letter codes are now ready for grouping and plotting.
df_exploded.head()

# One row per state, listing each species recorded there once
# ('<br>' separates the names in the hover text).
species_by_state = df_exploded.groupby('state_code')['common_name']
df_grouped = species_by_state.apply(lambda names: '<br>'.join(names.unique())).reset_index()

# Constant column used only to colour the map (all 1s).
df_grouped['presence'] = 1

# Draw the US map keyed on the 2-letter state codes.
usa_map_options = dict(
    locations='state_code',
    locationmode='USA-states',
    color='presence',
    hover_name='state_code',
    hover_data={'common_name': True, 'presence': False},
    scope='usa',
    color_continuous_scale='Viridis',
    labels={'common_name': 'Species Found'},
    title='Species Occurrence by US State',
)
fig = px.choropleth(df_grouped, **usa_map_options)

# Custom hover text: the location followed by the species list taken from the
# first hover-data column.
hover_text = '<b>%{location}</b><br>Species:<br>%{customdata[0]}<extra></extra>'
fig.update_traces(hovertemplate=hover_text)

fig.show()

# Collect, per species, the list of state codes it was recorded in
# (repeated codes are kept as they are).
states_per_species = df_exploded.groupby('common_name')['state_code'].agg(list).reset_index()

# Keep the species whose list does not contain Illinois.
# NOTE(review): rows without a usable state code were dropped from
# df_exploded, so a species with no US states at all never reaches this
# table -- confirm that leaving those out is intended.
seen_in_illinois = states_per_species['state_code'].apply(lambda states: "IL" in states)
df_definite = states_per_species[~seen_in_illinois]

# Save and display
df_definite.to_csv('subset_definitely_not_in_illinois.csv', index=False)
display(df_definite)

	uuid	class	order	family	genus	species	common_name
0	00049ff0-2ffa-4d82-8cf3-c861fbbfa9d5	mammalia	rodentia	muridae	rattus	NaN	rattus species
1	000e4049-11cd-4630-afd6-ea16a908d5ff	mammalia	cetartiodactyla	bovidae	gazella	gazella	mountain gazelle
2	000f61aa-c02a-46f4-b7a7-81fe76a9212f	mammalia	carnivora	canidae	lycaon	NaN	lycaon species
3	001795ae-1963-47f2-91cc-9dd627643a06	mammalia	cetartiodactyla	bovidae	nesotragus	NaN	nesotragus species
4	00339477-70ab-42aa-9a4f-ed2cca9a028f	aves	musophagiformes	musophagidae	tauraco	schuettii	black-billed turaco

	uuid	class	order	family	genus	species	common_name
59	04eda76f-c0e7-4e9e-85c3-5b1542db2915	amphibia	anura	bufonidae	rhinella	marina	cane toad
337	17b8145a-a164-4059-a68b-47b8b7438428	amphibia	anura	ranidae	rana	NaN	rana species
1168	5351aa59-81ba-4c9f-b453-f7a2830ae892	amphibia	anura	ranidae	NaN	NaN	true frogs
2063	96632fbc-d0d0-4880-9df7-f747f6f5ec11	amphibia	anura	NaN	NaN	NaN	frogs
499	23a6f03b-b3d0-471b-a67d-88f10cb64e59	amphibia	NaN	NaN	NaN	NaN	amphibian

	class	order	family	genus	scientific_name	taxonkey	country_codes	species
0	testudines	NaN	emydidae	trachemys	trachemys scripta	7062200	[pl,hu,ar,es,ch,do,cz,at,tw,ae,dk,bm,sk,hn,ro,...	scripta
1	aves	passeriformes	vireonidae	vireo	vireo gilvus	7191625	[ca,gt,us,sv,ni,mx,ec]	gilvus
2	aves	passeriformes	parulidae	protonotaria	protonotaria citrea	2489871	[bs,tc,ec,ni,gf,bl,na,ai,jm,bz,zz,cr,sv,kn,hn,...	citrea
3	aves	pelecaniformes	ardeidae	bubulcus	bubulcus ibis	4408439	[am,gy,bm,ma,aq,lr,hu,ky,au,ly,td,kn,uz,pa,sa,...	ibis
4	aves	strigiformes	strigidae	aegolius	aegolius funereus	5739298	[by,hr,im,ir,md,gg,cn,it,al,mn,cz,us,me,tj,no,...	funereus

	class	order	family	genus	scientific_name	taxonkey	country_codes	species	common_name
920	aves	piciformes	picidae	colaptes	colaptes auratus	6177448	[zz,us,ca]	auratus	northern flicker
921	mammalia	rodentia	cricetidae	microtus	microtus pennsylvanicus	7194068	[us,ca]	pennsylvanicus	NaN
922	mammalia	lagomorpha	leporidae	sylvilagus	sylvilagus transitionalis	2436905	[us]	transitionalis	NaN
923	squamata	NaN	colubridae	regina	regina grahamii	5222697	[us]	grahamii	NaN
924	aves	passeriformes	passerellidae	passerella	passerella iliaca	5788850	[us,ca]	iliaca	red fox-sparrow

	class	order	family	genus	scientific_name	taxonkey	country_codes	species	common_name
3	aves	pelecaniformes	ardeidae	bubulcus	bubulcus ibis	4408439	[am,gy,bm,ma,aq,lr,hu,ky,au,ly,td,kn,uz,pa,sa,...	ibis	cattle egret
5	aves	galliformes	phasianidae	rollulus	rollulus rouloul	2474113	[bn,zz,us,dk,de,br,th,ca,mm,la,my,id,in,ch,sg,...	rouloul	crested partridge
6	aves	podicipediformes	podicipedidae	podilymbus	podilymbus podiceps	7191591	[sr,zz,cu,gt,ni,mx,vi,hn,br,nl,ca,sv,bs,it,bo,...	podiceps	pied-billed grebe
8	aves	columbiformes	columbidae	columba	columba livia	7191490	[dk,gr,ma,jo,gb,es,mk,gt,jp,pt,zw,cl,eg,iq,bg,...	livia	rock dove
9	NaN	NaN	NaN	NaN	NaN	44	[nf,mu,pg,om,mv,ke,nl,sv,fi,gg,vn,je,tf,ie,mz,...	NaN	animal

	Unnamed: 0	class	count	%
0	1	blank	89424	84.70
1	2	bird	4948	4.70
2	3	human	4196	4.00
3	4	western pond turtle	1621	1.50
4	5	mallard	579	0.55

	class	count	%
3	western pond turtle	1621	1.500
14	domestic cattle	41	0.039
15	wild turkey	41	0.039
16	central american agouti	29	0.027
20	white-tailed deer	20	0.019
22	nutria	17	0.016
23	crocodile	16	0.015
26	wild boar	11	0.010
28	common tapeti	7	0.007
30	tome's spiny rat	6	0.006
31	ocellated turkey	5	0.005
34	collared peccary	3	0.003
36	spotted paca	3	0.003
38	madagascar crested ibis	2	0.002
42	red squirrel	1	0.001
43	bushy-tailed woodrat	1	0.001
44	pronghorn	1	0.001
45	eastern red forest rat	1	0.001
47	domestic chicken	1	0.001
49	blood pheasant	1	0.001
50	white-lipped peccary	1	0.001
51	red acouchi	1	0.001
52	desert cottontail	1	0.001
53	plains zebra	1	0.001
54	rufescent tiger-heron	1	0.001
55	owl	1	0.001
56	common wombat	1	0.001
58	bearded pig	1	0.001
60	fossa	1	0.001
61	nine-banded armadillo	1	0.001
62	guenther's dik-dik	1	0.001

	uuid	class	order	family	genus	species	common_name	scientific_name
2806	cae9534f-f302-4229-9e11-b91138333d92	aves	galliformes	phasianidae	gallus	gallus domesticus	domestic chicken	gallus gallus domesticus
1574	710eb57f-77d9-44ca-a1e1-4077f868c0fa	aves	galliformes	phasianidae	ithaginis	cruentus	blood pheasant	ithaginis cruentus
992	466b25f0-a916-432c-823e-394a69391328	aves	galliformes	phasianidae	meleagris	ocellata	ocellated turkey	meleagris ocellata
250	1110460b-7f99-405b-a9b0-65a09ecccca1	aves	pelecaniformes	ardeidae	tigrisoma	lineatum	rufescent tiger-heron	tigrisoma lineatum
2544	b84b365a-b420-4164-85a1-124afc96f1d7	aves	pelecaniformes	threskiornithidae	lophotibis	cristata	madagascar crested ibis	lophotibis cristata

	country_code	common_name	presence
0	AD	domestic cattle<br>wild boar	1
1	AE	domestic chicken<br>domestic cattle	1
2	AF	domestic cattle<br>wild boar	1
3	AG	domestic chicken	1
4	AI	domestic chicken	1

	common_name	state_code
0	bushy-tailed woodrat	[CO, CA, WY, OR, UT, WA, NM, MT, SD, NV, ID, N...
1	central american agouti	[TX]
3	common wombat	[NM, WA]
4	crocodile	[FL]
5	desert cottontail	[CA, AZ, CO, NM, TX, NV, UT, WY, MT, NE, SD, K...
8	fossa	[NE, TX]
9	guenther's dik-dik	[TX]
12	ocellated turkey	[CA, FL, HI, OK]
13	plains zebra	[CA, NM, TX, WA, AK, CO, MA, OH, OR, UT]
14	pronghorn	[WY, CO, NM, UT, AZ, SD, MT, OR, ID, TX, NV, C...
15	red acouchi	[NE, NY, WA]
17	rufescent tiger-heron	[HI]
18	spotted paca	[CO, CA, RI, UT, WA]
19	western pond turtle	[CA, OR, NV]
20	white-lipped peccary	[TX, CO, TX]

# Species Mismatch Analysis