Handling Missing Values¶
# import pandas
import pandas as pd
# read ufo data
ufo = pd.read_csv("http://bit.ly/uforeports")
# last 5 rows
ufo.tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | NaN | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | NaN | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | NaN | NaN | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
# check missing values
ufo.isnull()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | False | True | False | False | False |
1 | False | True | False | False | False |
2 | False | True | False | False | False |
3 | False | True | False | False | False |
4 | False | True | False | False | False |
... | ... | ... | ... | ... | ... |
18236 | False | True | False | False | False |
18237 | False | True | False | False | False |
18238 | False | True | True | False | False |
18239 | False | False | False | False | False |
18240 | False | True | False | False | False |
18241 rows × 5 columns
Note
True: Missing
False: Not Missing
# using notnull()
ufo.notnull()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | True | False | True | True | True |
1 | True | False | True | True | True |
2 | True | False | True | True | True |
3 | True | False | True | True | True |
4 | True | False | True | True | True |
... | ... | ... | ... | ... | ... |
18236 | True | False | True | True | True |
18237 | True | False | True | True | True |
18238 | True | False | False | True | True |
18239 | True | True | True | True | True |
18240 | True | False | True | True | True |
18241 rows × 5 columns
Note
axis = 0: Rows
axis = 1: Columns
# sum of missing values: by default axis = 0
ufo.isnull().sum()
City 25
Colors Reported 15359
Shape Reported 2644
State 0
Time 0
dtype: int64
# Let's create a series
pd.Series([True, False, True]).sum()
2
# filtering using isnull()
ufo[ufo.City.isnull()]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
21 | NaN | NaN | NaN | LA | 8/15/1943 0:00 |
22 | NaN | NaN | LIGHT | LA | 8/15/1943 0:00 |
204 | NaN | NaN | DISK | CA | 7/15/1952 12:30 |
241 | NaN | BLUE | DISK | MT | 7/4/1953 14:00 |
613 | NaN | NaN | DISK | NV | 7/1/1960 12:00 |
1877 | NaN | YELLOW | CIRCLE | AZ | 8/15/1969 1:00 |
2013 | NaN | NaN | NaN | NH | 8/1/1970 9:30 |
2546 | NaN | NaN | FIREBALL | OH | 10/25/1973 23:30 |
3123 | NaN | RED | TRIANGLE | WV | 11/25/1975 23:00 |
4736 | NaN | NaN | SPHERE | CA | 6/23/1982 23:00 |
5269 | NaN | NaN | NaN | AZ | 6/30/1985 21:30 |
6735 | NaN | NaN | FORMATION | TX | 4/1/1992 2:00 |
7208 | NaN | NaN | CIRCLE | MI | 10/4/1993 17:30 |
8828 | NaN | NaN | TRIANGLE | WA | 10/30/1995 21:30 |
8967 | NaN | NaN | VARIOUS | CA | 12/8/1995 18:00 |
9273 | NaN | NaN | TRIANGLE | OH | 5/1/1996 3:00 |
9388 | NaN | NaN | OVAL | CA | 6/12/1996 12:00 |
9587 | NaN | NaN | EGG | FL | 8/24/1996 15:00 |
10399 | NaN | NaN | TRIANGLE | IL | 6/15/1997 23:00 |
11625 | NaN | NaN | CIRCLE | TX | 6/7/1998 7:00 |
12441 | NaN | RED | FIREBALL | WA | 10/26/1998 17:58 |
15767 | NaN | NaN | RECTANGLE | NV | 1/21/2000 11:30 |
15812 | NaN | NaN | LIGHT | NV | 2/2/2000 3:00 |
16054 | NaN | GREEN | NaN | FL | 3/11/2000 3:30 |
16608 | NaN | NaN | SPHERE | NY | 6/15/2000 15:00 |
# Check specific column
ufo.City.isnull().sum()
25
Drop Missing Values¶
# shape
ufo.shape
(18241, 5)
# drop missing: drop row contains missing values
# it is inplace = False
ufo.dropna(how='any').shape
(2486, 5)
# how=all
ufo.dropna(how='all').shape
(18241, 5)
# subset: any
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
(15576, 5)
# subset: all
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape
(18237, 5)
Filling Missing Values¶
# value counts: by default drop = True
ufo["Shape Reported"].value_counts()
LIGHT 2803
DISK 2122
TRIANGLE 1889
OTHER 1402
CIRCLE 1365
SPHERE 1054
FIREBALL 1039
OVAL 845
CIGAR 617
FORMATION 434
VARIOUS 333
RECTANGLE 303
CYLINDER 294
CHEVRON 248
DIAMOND 234
EGG 197
FLASH 188
TEARDROP 119
CONE 60
CROSS 36
DELTA 7
ROUND 2
CRESCENT 2
PYRAMID 1
HEXAGON 1
DOME 1
FLARE 1
Name: Shape Reported, dtype: int64
# value counts: false
ufo["Shape Reported"].value_counts(dropna=False)
LIGHT 2803
NaN 2644
DISK 2122
TRIANGLE 1889
OTHER 1402
CIRCLE 1365
SPHERE 1054
FIREBALL 1039
OVAL 845
CIGAR 617
FORMATION 434
VARIOUS 333
RECTANGLE 303
CYLINDER 294
CHEVRON 248
DIAMOND 234
EGG 197
FLASH 188
TEARDROP 119
CONE 60
CROSS 36
DELTA 7
CRESCENT 2
ROUND 2
PYRAMID 1
DOME 1
FLARE 1
HEXAGON 1
Name: Shape Reported, dtype: int64
# fillna()
ufo["Shape Reported"].fillna(value="VARIOUS", inplace=True)
# now take a look
ufo["Shape Reported"].value_counts()
VARIOUS 2977
LIGHT 2803
DISK 2122
TRIANGLE 1889
OTHER 1402
CIRCLE 1365
SPHERE 1054
FIREBALL 1039
OVAL 845
CIGAR 617
FORMATION 434
RECTANGLE 303
CYLINDER 294
CHEVRON 248
DIAMOND 234
EGG 197
FLASH 188
TEARDROP 119
CONE 60
CROSS 36
DELTA 7
CRESCENT 2
ROUND 2
PYRAMID 1
HEXAGON 1
DOME 1
FLARE 1
Name: Shape Reported, dtype: int64