Reading Data into Pandas

# conventional way to import pandas
import pandas as pd 

Read CSV

# Load the cleaned COVID-19 dataset from disk
csv_path = "../data/covid-19_cleaned_data.csv"
corona = pd.read_csv(csv_path)
# Peek at the first five rows to sanity-check the load
corona.head()
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 Thailand Thailand 15.0000 101.0000 2020-01-22 2 0 0
1 Japan Japan 36.0000 138.0000 2020-01-22 2 0 0
2 Singapore Singapore 1.2833 103.8333 2020-01-22 0 0 0
3 Nepal Nepal 28.1667 84.2500 2020-01-22 0 0 0
4 Malaysia Malaysia 2.5000 112.5000 2020-01-22 0 0 0

Read Excel Sheet

# Load the movies spreadsheet (first sheet by default)
excel_path = "../data/movies.xls"
movies = pd.read_excel(excel_path)
# Preview the first five rows
movies.head()
Title Year Genres Language Country Content Rating Duration Aspect Ratio Budget Gross Earnings ... Facebook Likes - Actor 1 Facebook Likes - Actor 2 Facebook Likes - Actor 3 Facebook Likes - cast Total Facebook likes - Movie Facenumber in posters User Votes Reviews by Users Reviews by Crtiics IMDB Score
0 Intolerance: Love's Struggle Throughout the Ages 1916 Drama|History|War NaN USA Not Rated 123 1.33 385907.0 NaN ... 436 22 9.0 481 691 1 10718 88 69.0 8.0
1 Over the Hill to the Poorhouse 1920 Crime|Drama NaN USA NaN 110 1.33 100000.0 3000000.0 ... 2 2 0.0 4 0 1 5 1 1.0 4.8
2 The Big Parade 1925 Drama|Romance|War NaN USA Not Rated 151 1.33 245000.0 NaN ... 81 12 6.0 108 226 0 4849 45 48.0 8.3
3 Metropolis 1927 Drama|Sci-Fi German Germany Not Rated 145 1.33 6000000.0 26435.0 ... 136 23 18.0 203 12000 1 111841 413 260.0 8.3
4 Pandora's Box 1929 Crime|Drama|Romance German Germany Not Rated 110 1.33 NaN 9950.0 ... 426 20 3.0 455 926 1 7431 84 71.0 8.0

5 rows × 25 columns

Read Multiple Excel Sheets

import xlrd
# Open the workbook once, then parse every sheet into its own DataFrame
xl_file = pd.ExcelFile("../data/data.xls")
# Map each sheet name to the DataFrame parsed from that sheet
dfs = {name: xl_file.parse(name) for name in xl_file.sheet_names}
# The sheet names double as dictionary keys
keylist = list(dfs.keys())
# Show a sample of the sheet names (slice skips the first entry)
keylist[1:10]
['2020-03-13-03-30',
 '2020-03-13-00-00',
 '2020-03-12-22-00',
 '2020-03-12-21-30',
 '2020-03-12-21-00',
 '2020-03-12-20-00',
 '2020-03-12-18-30',
 '2020-03-12-17-00',
 '2020-03-12-15-30']
# Look up the DataFrame for the first sheet via its name
first_sheet_name = keylist[0]
dfs[first_sheet_name]
Province/State Country/Region Last Update Confirmed Deaths Recovered
0 Hubei Mainland China 3/13/2020 06:00 67786 3062 51553
1 Guangdong Mainland China 3/13/2020 06:00 1356 8 1296
2 Zhejiang Mainland China 3/13/2020 06:00 1215 1 1209
3 Shandong Mainland China 3/13/2020 06:00 760 7 739
4 Henan Mainland China 3/13/2020 06:00 1273 22 1249
... ... ... ... ... ... ...
216 NaN Mongolia 3/13/2020 06:00 1 0 0
217 NaN St. Barth 3/13/2020 06:00 1 0 0
218 NaN St. Vincent Grenadines 3/13/2020 06:00 1 0 0
219 NaN Togo 3/13/2020 06:00 1 0 0
220 NaN Trinidad and Tobago 3/13/2020 06:00 1 0 0

221 rows × 6 columns

From URL

# Fetch the Chipotle orders dataset (tab-separated) straight from a URL;
# read_table is read_csv with a tab separator, so use read_csv explicitly
orders = pd.read_csv('http://bit.ly/chiporders', sep='\t')
# inspect the first five rows
orders.head()
order_id quantity item_name choice_description item_price
0 1 1 Chips and Fresh Tomato Salsa NaN $2.39
1 1 1 Izze [Clementine] $3.39
2 1 1 Nantucket Nectar [Apple] $3.39
3 1 1 Chips and Tomatillo-Green Chili Salsa NaN $2.39
4 2 2 Chicken Bowl [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
# examine the last five rows (tail's default is n=5, made explicit here)
orders.tail(5)
order_id quantity item_name choice_description item_price
4617 1833 1 Steak Burrito [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... $11.75
4618 1833 1 Steak Burrito [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... $11.75
4619 1834 1 Chicken Salad Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $11.25
4620 1834 1 Chicken Salad Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75
4621 1834 1 Chicken Salad Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75
# examine the first ten rows, passing the count as a keyword argument
orders.head(n=10)
order_id quantity item_name choice_description item_price
0 1 1 Chips and Fresh Tomato Salsa NaN $2.39
1 1 1 Izze [Clementine] $3.39
2 1 1 Nantucket Nectar [Apple] $3.39
3 1 1 Chips and Tomatillo-Green Chili Salsa NaN $2.39
4 2 2 Chicken Bowl [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
5 3 1 Chicken Bowl [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... $10.98
6 3 1 Side of Chips NaN $1.69
7 4 1 Steak Burrito [Tomatillo Red Chili Salsa, [Fajita Vegetables... $11.75
8 4 1 Steak Soft Tacos [Tomatillo Green Chili Salsa, [Pinto Beans, Ch... $9.25
9 5 1 Steak Burrito [Fresh Tomato Salsa, [Rice, Black Beans, Pinto... $9.25
# examine the last ten rows, passing the count as a keyword argument
orders.tail(n=10)
order_id quantity item_name choice_description item_price
4612 1831 1 Carnitas Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... $9.25
4613 1831 1 Chips NaN $2.15
4614 1831 1 Bottled Water NaN $1.50
4615 1832 1 Chicken Soft Tacos [Fresh Tomato Salsa, [Rice, Cheese, Sour Cream]] $8.75
4616 1832 1 Chips and Guacamole NaN $4.45
4617 1833 1 Steak Burrito [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... $11.75
4618 1833 1 Steak Burrito [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... $11.75
4619 1834 1 Chicken Salad Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $11.25
4620 1834 1 Chicken Salad Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75
4621 1834 1 Chicken Salad Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75

Modify Dataset

# Read the movie reviewers dataset with read_table's defaults first, to
# show why the defaults don't fit this pipe-delimited file.
# (Fixed the doubled slash in the original short URL: bit.ly//movieusers)
users = pd.read_table('http://bit.ly/movieusers')
# examine the first 5 rows -- every pipe-delimited row lands in one column
users.head()
1|24|M|technician|85711
0 2|53|F|other|94043
1 3|23|M|writer|32067
2 4|24|M|technician|43537
3 5|33|F|other|15213
4 6|42|M|executive|98101
# The defaults are wrong for this file: it is '|'-separated and has no
# header row, so pass sep/header plus explicit column names.
# (Fixed the doubled slash in the original short URL: bit.ly//movieusers)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers',
                      sep='|', header=None, names=user_cols)
# now take a look at the properly parsed dataset
users.head()
user_id age gender occupation zip_code
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
3 4 24 M technician 43537
4 5 33 F other 15213

Read Biological Data (.txt)

# Read the tab-delimited ENCODE HMM annotation file; it has no header
# row, so columns get default integer labels
chrom = pd.read_csv("../data/Encode_HMM_data.txt", sep="\t", header=None)
# Examine the first few rows
chrom.head()
0 1 2 3 4 5 6 7 8
0 chr1 10000 10600 15_Repetitive/CNV 0 . 10000 10600 245,245,245
1 chr1 10600 11137 13_Heterochrom/lo 0 . 10600 11137 245,245,245
2 chr1 11137 11737 8_Insulator 0 . 11137 11737 10,190,254
3 chr1 11737 11937 11_Weak_Txn 0 . 11737 11937 153,255,102
4 chr1 11937 12137 7_Weak_Enhancer 0 . 11937 12137 255,252,4
# The file has 9 tab-separated columns but only the first four matter.
# Passing only 4 names for a 9-column file makes pandas absorb the five
# extra columns into the row index (which garbles the frame), so restrict
# the read to the first four columns with usecols.
cols_name = ['chrom', 'start', 'stop', 'type']
chrom = pd.read_csv("../data/Encode_HMM_data.txt", delimiter="\t",
                    header=None, names=cols_name, usecols=[0, 1, 2, 3])
# now examine first few rows
chrom.head()
chrom start stop type
chr1 10000 10600 15_Repetitive/CNV 0 . 10000 10600 245,245,245
10600 11137 13_Heterochrom/lo 0 . 10600 11137 245,245,245
11137 11737 8_Insulator 0 . 11137 11737 10,190,254
11737 11937 11_Weak_Txn 0 . 11737 11937 153,255,102
11937 12137 7_Weak_Enhancer 0 . 11937 12137 255,252,4

Read Biological Data (.tsv)

# A TSV is just a CSV with a tab delimiter
pokemon = pd.read_csv("../data/pokemon.tsv", delimiter="\t")
# show the first five rows
pokemon.head()
Number Name Type_1 Type_2 Total HP Attack Defense Sp_Atk Sp_Def ... Color hasGender Pr_Male Egg_Group_1 Egg_Group_2 hasMegaEvolution Height_m Weight_kg Catch_Rate Body_Style
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 ... Green True 0.875 Monster Grass False 0.71 6.9 45 quadruped
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 ... Green True 0.875 Monster Grass False 0.99 13.0 45 quadruped
2 3 Venusaur Grass Poison 525 80 82 83 100 100 ... Green True 0.875 Monster Grass True 2.01 100.0 45 quadruped
3 4 Charmander Fire NaN 309 39 52 43 60 50 ... Red True 0.875 Monster Dragon False 0.61 8.5 45 bipedal_tailed
4 5 Charmeleon Fire NaN 405 58 64 58 80 65 ... Red True 0.875 Monster Dragon False 1.09 19.0 45 bipedal_tailed

5 rows × 23 columns

Read HTML Data

# Scrape all HTML tables from the FDIC failed-bank listing page
url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'
# pd.read_html is the public API; pd.io.html.read_html is a private
# module path and may break between pandas versions
data = pd.read_html(url)
# read_html always returns a list of DataFrames, one per <table> found
type(data)
list
# access the first table on the page (read_html returned a list)
data[0]
Bank Name City ST CERT Acquiring Institution Closing Date
0 The First State Bank Barboursville WV 14361 MVB Bank, Inc. April 3, 2020
1 Ericson State Bank Ericson NE 18265 Farmers and Merchants Bank February 14, 2020
2 City National Bank of New Jersey Newark NJ 21111 Industrial Bank November 1, 2019
3 Resolute Bank Maumee OH 58317 Buckeye State Bank October 25, 2019
4 Louisa Community Bank Louisa KY 58112 Kentucky Farmers Bank Corporation October 25, 2019
... ... ... ... ... ... ...
556 Superior Bank, FSB Hinsdale IL 32646 Superior Federal, FSB July 27, 2001
557 Malta National Bank Malta OH 6629 North Valley Bank May 3, 2001
558 First Alliance Bank & Trust Co. Manchester NH 34264 Southern New Hampshire Bank & Trust February 2, 2001
559 National State Bank of Metropolis Metropolis IL 3815 Banterra Bank of Marion December 14, 2000
560 Bank of Honolulu Honolulu HI 21029 Bank of the Orient October 13, 2000

561 rows × 6 columns