Nicholas Wolf
ORCID 0000-0001-5512-6151
This lesson is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
Overview
This class is meant to be an overview of using Python Pandas for those who have never used it before, or who have made some progress but could use further guidance. Some background in working with tabular data is helpful, but not required. We'll focus on the following goals: understanding the Series and DataFrame objects, creating and loading dataframes, selecting and filtering data, updating values, merging and concatenating tables, and generating quick summary statistics.
Materials
We'll be using this Jupyter Notebook as a basis for the lesson. You can always find this notebook (in non-executable form) at https://nyu-dataservices.github.io/startingpandas.
You can access the code/notebooks for this class in the following ways:
If you have Jupyter Notebooks and Pandas installed on your laptop, clone the class repository:
git clone https://github.com/NYU-DataServices/startingpandas.git
If you do not have Python, Pandas, and Jupyter Notebooks installed:
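One option (an assumption on my part; the Anaconda distribution, which bundles Python, Pandas, and Jupyter, is another common route) is to install the two libraries with pip:

pip install pandas notebook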
Pandas can be a powerful tool, especially for those with a background in other statistical software who are looking for a way to work with tabular data in Python. But it isn't the only (or in some cases even the best) means of data munging or data analysis in Python, particularly for large data.
For example, note the respective size of these two Python objects:
import pandas as pd
# Create a 900 x 900 table of integers and store it as a simple Python list of list-rows:
list_lists = [list(range(0,900)) for i in range(0,900)]
# Make a Pandas dataframe out of that same table
df_list_lists = pd.DataFrame(list_lists)
# Note the size difference in memory of these two objects, reported in bytes.
# One caveat: __sizeof__() on the list counts only the outer list of row
# pointers, not the 900 nested row lists, so it understates the list's true footprint.
print(list_lists.__sizeof__())
print(df_list_lists.__sizeof__())
7816
6480128
By this (admittedly shallow) measure, the Python list of lists is considerably smaller in bytes than the dataframe, which reports the full size of its underlying data buffer.
Unsurprisingly, users experience periodic issues in reading large tables into a Pandas dataframe because of this overhead. A sense of these problems and common workarounds can be found in the many Stack Overflow threads on reading large files with Pandas.
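One common workaround is to read the file in chunks and keep only what you need from each chunk. Here is a minimal sketch; the filename and the "value" column are hypothetical placeholders, not part of this lesson's data:

# Read a hypothetical large CSV 100,000 rows at a time instead of all at once
chunks = pd.read_csv("large_table.csv", chunksize=100000)
# Filter each chunk as it is read, then combine the survivors into one dataframe
result = pd.concat(chunk[chunk["value"] > 0] for chunk in chunks)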
On the other hand, our Pandas dataframe will start to outperform Python loops for modifying data as our table/matrix gets larger:
# Update the fourth column of our list of lists
def update_list(list_lists):
    new_list_lists = []
    for row in list_lists:
        new_list_lists.append(row[0:3] + [row[3] * 3] + row[4:])
    print(list_lists[0][0:5])      # the original row, unchanged
    print(new_list_lists[0][0:5])  # the updated row: fourth value tripled
%timeit -n 1 -r 1 update_list(list_lists)
[0, 1, 2, 3, 4]
[0, 1, 2, 9, 4]
7.01 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%timeit -n 1 -r 1 df_list_lists[3] = df_list_lists[3].apply(lambda x: x*3)
df_list_lists.head(5)
4.73 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 2 | 9 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 |
1 | 0 | 1 | 2 | 9 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 |
2 | 0 | 1 | 2 | 9 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 |
3 | 0 | 1 | 2 | 9 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 |
4 | 0 | 1 | 2 | 9 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 |
5 rows × 900 columns
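As an aside (not timed in the original version of this comparison), skipping .apply() in favor of a fully vectorized expression is typically faster still, since the multiplication runs in compiled code rather than calling a Python lambda once per element:

%timeit -n 1 -r 1 df_list_lists[3] = df_list_lists[3] * 3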
In other words, if you simply need a container to "hold" your data, a simple core Python structure is often all you need. But if you need to do full-table transformations, quick statistics, advanced statistics, and table relational joins, then Pandas is a great option.
It is essential if you want to do the steps above AND you have non-uniform data types.
Unlike numpy, another commonly used matrix library, Pandas dataframes accommodate tables/matrices that mix integers, strings, and other data types across columns. (Pandas is in fact built on top of numpy.)
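A quick sketch of the difference, using made-up values: numpy coerces a mixed table to a single dtype, while Pandas keeps one dtype per column.

import numpy as np

mixed = [["student1", 92], ["student2", 87]]
arr = np.array(mixed)                                # numpy upcasts everything to strings
df = pd.DataFrame(mixed, columns=["name", "score"])  # Pandas keeps per-column dtypes
print(arr.dtype)                                     # <U8 -- the integers became strings
print(df.dtypes)                                     # name: object, score: int64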
To understand how a dataframe works in Pandas (or any other environment), we can think about the multiple ways a two-dimensional table can be assembled.
We might conceive of a table as consisting of rows, or observations, with each row made up of elements ordered so that they align with a column location telling us what the value is for any given variable.
But we also might think of a table as consisting of vertical uniform-length columns, each representing the measurement of a single variable across the same number of observations, that are then assembled by stacking them side-by-side.
In Python terms, we might think of rows and columns in a table as having some "dictionary-like" qualities, and some "list-like" qualities. For example, the elements of a row can be conceived of as values that are each paired with a key corresponding to our column headers (or variables):
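Using hypothetical student data (the original table illustration is not reproduced here), that might look like:

# Each row is a dictionary pairing column headers (keys) with values
rows = [{"student_name": "student1", "grade": 91},
        {"student_name": "student2", "grade": 88},
        {"student_name": "student3", "grade": 95},
        {"student_name": "student4", "grade": 79}]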
We can also think of the rows and columns as having an index order, like a Python list, so that we might slice and retrieve a column or row (or a value using both column and row) using its index:
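Continuing with the same hypothetical data, now as nested lists:

# Both rows and columns have positional order, like nested Python lists
table = [["student1", 91],
         ["student2", 88],
         ["student3", 95],
         ["student4", 79]]
table[1]        # second row: ['student2', 88]
table[1][0]     # 'student2' -- row index 1, column index 0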
And we can think of each column as being like a Python list with its own order, so that once again we might access individual values; columns can then be stacked next to each other to form a table:
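Again with the same hypothetical data:

# Each column is an ordered list; stacking them side by side forms the table
student_names = ["student1", "student2", "student3", "student4"]
grades = [91, 88, 95, 79]
grades[2]       # 95 -- the third value in the grade column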
Recognizing these hybrid dictionary- and list-like qualities of the components of a two-dimensional array, the building block for a Pandas dataframe is the Series object.
# We might create a Series from a list:
list_series = pd.Series(["student1", "student2", "student3", "student4"])
list_series
0    student1
1    student2
2    student3
3    student4
dtype: object
Note that our resulting Series has an index, and looks like a 4 x 1 (4 rows x 1 column) array. Let's add a name so that we understand what this column/vector of values refers to:
named_list_series = pd.Series(["student1", "student2", "student3", "student4"], name="student_name")
named_list_series
0    student1
1    student2
2    student3
3    student4
Name: student_name, dtype: object
Our Series can be sliced by index location, much like a list:
named_list_series[0]
'student1'
named_list_series[0:2]
0    student1
1    student2
Name: student_name, dtype: object
Great! But in and of itself, a Series object isn't that helpful. Putting several together, however, gives us a dataframe. We can do this by instantiating a DataFrame object that has been passed a dictionary of Series, i.e. one or more Series objects, each identified with a key that will serve as the column header:
year_series = pd.Series([1990, 1991, 1992, 1993])
pop_series = pd.Series([1.5, 1.6, 1.8, 2.0])
population_table = pd.DataFrame({"year":year_series, "pop":pop_series})
population_table
year | pop | |
---|---|---|
0 | 1990 | 1.5 |
1 | 1991 | 1.6 |
2 | 1992 | 1.8 |
3 | 1993 | 2.0 |
Note that Pandas automatically builds for us a row index, highlighted in bold, on the lefthand side. If we had failed to provide column names, it would have used index numbers to label them.
That's all we need to know about the Pandas Series object to get started. Mostly, this is helpful so that we know that when we operate on a single column sliced from a dataframe, we are operating on a Series object.
We have several options for how to make a dataframe and start working in Pandas:
1. We can load a tabular data file and allow Pandas to parse it as a dataframe
2. We can instantiate an empty dataframe and append rows or columns in the form of Series objects
3. We can transform a complex Python structure (such as a list of lists or a list of dictionaries) into a dataframe
No matter which approach is taken, I recommend taking some time to set the various parameters of pd.DataFrame (or, when reading from a file, pd.read_csv) so that your later work on the dataframe has expected results. This includes setting column names, column order, data types of variables, and (when reading from file) encoding.
Here are examples of all three:
# Loading from a character-delimited file using explicit settings for delimiter,
# header, data types, and using column names to order the resulting dataframe
df_from_csv = pd.read_csv("water-consumption-nyc-csv-example.csv", delimiter=",",
                          header=0, names=["Year", "NYC_Pop", "Consumption", "PerCapita_Consumption"],
                          dtype={"Year": int, "NYC_Pop": int, "Consumption": int, "PerCapita_Consumption": int})
# We can use .head() and .tail() to preview just a portion of a dataframe.
# Pass as an optional parameter the number of rows you wish to see
df_from_csv.head(5)
Year | NYC_Pop | Consumption | PerCapita_Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 1512 | 213 |
1 | 1980 | 7071639 | 1506 | 213 |
2 | 1981 | 7089241 | 1309 | 185 |
3 | 1982 | 7109105 | 1382 | 194 |
4 | 1983 | 7181224 | 1424 | 198 |
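The encoding mentioned earlier is set the same way. In this sketch, utf-8 (and the throwaway name df_utf8) is an assumption; match the encoding to however your file was actually saved:

# Hypothetical re-read of the same file with an explicit encoding
df_utf8 = pd.read_csv("water-consumption-nyc-csv-example.csv", encoding="utf-8")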
# Rearranging column order
df_from_csv = df_from_csv[["Year","NYC_Pop", "PerCapita_Consumption", "Consumption"]]
df_from_csv.head(5)
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1512 |
1 | 1980 | 7071639 | 213 | 1506 |
2 | 1981 | 7089241 | 185 | 1309 |
3 | 1982 | 7109105 | 194 | 1382 |
4 | 1983 | 7181224 | 198 | 1424 |
# We create an empty dataframe, then add one or more rows, each as a dictionary with keys aligned with the columns of empty_df
empty_df = pd.DataFrame(columns = ["Year","NYC_Pop", "PerCapita_Consumption", "Consumption"])
newrows = pd.DataFrame([
    {"Year": 1984,
     "NYC_Pop": 8102100,
     "PerCapita_Consumption": 188,
     "Consumption": 1302},
    {"Year": 1985,
     "NYC_Pop": 8902100,
     "PerCapita_Consumption": 176,
     "Consumption": 1203}
], columns=["Year", "NYC_Pop", "PerCapita_Consumption", "Consumption"])
empty_df = pd.concat([empty_df,newrows], ignore_index=True)
empty_df.head(5)
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1984 | 8102100 | 188 | 1302 |
1 | 1985 | 8902100 | 176 | 1203 |
One of the most common operations we need to do with dataframes is filter or select a subset of observations, or perhaps a set of columns, to use for analysis. Let's start by looking at how we can grab just one or more columns from a dataframe.
df_from_csv["Year"]
0     1979
1     1980
2     1981
3     1982
4     1983
5     1984
6     1985
7     1986
8     1987
9     1988
10    1989
11    1990
12    1991
13    1992
14    1993
15    1994
16    1995
17    1996
18    1997
19    1998
20    1999
21    2000
22    2001
23    2002
24    2003
25    2004
26    2005
27    2006
28    2007
29    2008
30    2009
31    2010
32    2011
33    2012
34    2013
35    2014
36    2015
37    2016
Name: Year, dtype: int64
# We can grab one or more columns by column name (grabbing by index location comes later, with .iloc)
sub_df_from_csv = df_from_csv["Year"]
sub_df_from_csv.head(5)
0    1979
1    1980
2    1981
3    1982
4    1983
Name: Year, dtype: int64
# Note that a single column subsetted from a dataframe is a Series
type(sub_df_from_csv)
pandas.core.series.Series
# Extracting multiple columns. Note that we pass multiple column names as a list
sub2_df_from_csv = df_from_csv[["Year", "NYC_Pop"]]
sub2_df_from_csv.head(5)
Year | NYC_Pop | |
---|---|---|
0 | 1979 | 7102100 |
1 | 1980 | 7071639 |
2 | 1981 | 7089241 |
3 | 1982 | 7109105 |
4 | 1983 | 7181224 |
# Pulling multiple columns gives us a dataframe!
type(sub2_df_from_csv)
pandas.core.frame.DataFrame
sub3_df_from_csv = df_from_csv[2:4]
sub3_df_from_csv
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
2 | 1981 | 7089241 | 185 | 1309 |
3 | 1982 | 7109105 | 194 | 1382 |
What snippet of code would you use to slice out the 12th, 13th, and 14th rows and the columns "NYC_Pop" and "PerCapita_Consumption"?
# Add your answer here (one solution appears at the end of this notebook)
The row and column extraction operations are only useful up to a point -- usually you want to select a subset of your dataframe using some kind of complex criteria. We have two main means of doing this: by index location of columns and rows, and by label name and/or some kind of Boolean test of the value in any given "cell." A third means combines the two. We start by looking at filtering/selecting by index location using .iloc[].
The concept here will be familiar to those comfortable with list slicing, or with how a matrix can be extracted in numpy. The pattern is:
dataframe.iloc[row_index_start : row_index_stop, column_index_start : column_index_stop]
# First two rows, third and fourth column only
df_from_csv.iloc[0:2, 2:4]
PerCapita_Consumption | Consumption | |
---|---|---|
0 | 213 | 1512 |
1 | 213 | 1506 |
# All rows, second column only
df_from_csv.iloc[:, 1]
0     7102100
1     7071639
2     7089241
3     7109105
4     7181224
5     7234514
6     7274054
7     7319246
8     7342476
9     7353719
10    7344175
11    7335650
12    7374501
13    7428944
14    7506166
15    7570458
16    7633040
17    7697812
18    7773443
19    7858259
20    7947660
21    8008278
22    8024964
23    8041649
24    8058335
25    8075020
26    8091706
27    8108391
28    8125077
29    8141762
30    8158448
31    8175133
32    8175133
33    8336697
34    8405837
35    8491079
36    8550405
37    8537673
Name: NYC_Pop, dtype: int64
# Third row, all columns
df_from_csv.iloc[2:3, :]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
2 | 1981 | 7089241 | 185 | 1309 |
We can also select using the same pattern (colon separated spans, a comma between the row and column decisions), but use a label or Boolean filter to subset.
The pattern is:
dataframe.loc[row_label(s), column_label(s)]
or, for a Boolean:
dataframe.loc[ True/False test for row values in a column, column_label(s)]
# Selecting all rows, but only the Year and NYC_Pop columns:
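# Note that unlike .iloc, a .loc label slice is inclusive of the stop label,
# so the "NYC_Pop" column is kept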
df_from_csv.loc[:, "Year":"NYC_Pop"].head(5)
Year | NYC_Pop | |
---|---|---|
0 | 1979 | 7102100 |
1 | 1980 | 7071639 |
2 | 1981 | 7089241 |
3 | 1982 | 7109105 |
4 | 1983 | 7181224 |
# We also have an option to pass two or more column labels, even if non-adjacent, to select columns we want:
df_from_csv.loc[:, ["Year", "Consumption"]].head(5)
Year | Consumption | |
---|---|---|
0 | 1979 | 1512 |
1 | 1980 | 1506 |
2 | 1981 | 1309 |
3 | 1982 | 1382 |
4 | 1983 | 1424 |
# Understand that the first parameter, the "row" label, is matched against whichever column serves as the index.
# In all of our examples so far, that has been the numerical index Pandas built for us. Thus:
df_from_csv.loc[0, ["Year", "Consumption"]]
Year           1979
Consumption    1512
Name: 0, dtype: int64
# However, keep in mind that we can set one of our columns to be an index,
# enabling us to use the column values as label
df_newindex = df_from_csv.copy().set_index("Year")
df_newindex.head(5)
NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|
Year | |||
1979 | 7102100 | 213 | 1512 |
1980 | 7071639 | 213 | 1506 |
1981 | 7089241 | 185 | 1309 |
1982 | 7109105 | 194 | 1382 |
1983 | 7181224 | 198 | 1424 |
df_newindex.loc[1981, :].head(5)
NYC_Pop                  7089241
PerCapita_Consumption        185
Consumption                 1309
Name: 1981, dtype: int64
Lastly, we have our Boolean means of filtering using .loc[]. Note that we want to access a column within our brackets and perform some kind of True/False Boolean test. We can do this using the dataframe.columnName syntax (as long as our column names do not contain spaces; if they do, you must use dataframe["columnName"] instead).
df_from_csv.loc[df_from_csv.Year < 1983]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1512 |
1 | 1980 | 7071639 | 213 | 1506 |
2 | 1981 | 7089241 | 185 | 1309 |
3 | 1982 | 7109105 | 194 | 1382 |
# Alternatively, same thing:
df_from_csv.loc[df_from_csv["Year"] < 1983]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1512 |
1 | 1980 | 7071639 | 213 | 1506 |
2 | 1981 | 7089241 | 185 | 1309 |
3 | 1982 | 7109105 | 194 | 1382 |
We can chain multiple Boolean tests together to get some pretty sophisticated filtering:
# Note that we use | (pipe) and & for "OR" and "AND" bitwise Booleans, respectively. We MUST use parentheses
# around each Boolean test so that it is evaluated before | or &; this follows Python's operator precedence
# and associativity rules (similar to BODMAS in mathematics). For more information:
# https://www.geeksforgeeks.org/precedence-and-associativity-of-operators-in-python/#
df_from_csv.loc[(df_from_csv.Year < 1983) | (df_from_csv.Year == 1989)]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1512 |
1 | 1980 | 7071639 | 213 | 1506 |
2 | 1981 | 7089241 | 185 | 1309 |
3 | 1982 | 7109105 | 194 | 1382 |
10 | 1989 | 7344175 | 191 | 1402 |
# Using more than one AND/OR and added parentheses to clarify
df_from_csv.loc[((df_from_csv.Year < 1983) | (df_from_csv.Year == 1989)) & (df_from_csv.Consumption > 1320)]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1512 |
1 | 1980 | 7071639 | 213 | 1506 |
3 | 1982 | 7109105 | 194 | 1382 |
10 | 1989 | 7344175 | 191 | 1402 |
As an aside, slightly hidden from us is the matrix logic used behind the scenes to make all of this happen.
In reality, each Boolean test inside the parentheses returns a Boolean Series, with one True/False value per row based on the condition you have indicated. It is this True/False mask that is then used to filter the rows of the original dataframe.
We can test this by simply evaluating the Boolean statement itself:
df_from_csv.Year < 1983
0      True
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
Name: Year, dtype: bool
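A related idiom worth knowing (standard Pandas, though not used in the original examples) is .isin(), which builds the same kind of Boolean mask from a list of values:

# Select the rows where Year is any of the listed values
df_from_csv.loc[df_from_csv.Year.isin([1979, 1989])]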
If you come across examples on Stack Overflow using .ix[] to filter/select data, disregard them: .ix is deprecated and has been removed from recent versions of Pandas.
Once you are comfortable selecting subsections of a dataframe, from entire rows, columns, sub-tables, or individual single values, ways of updating those values can take a few forms. We'll concentrate on two.
Let's say we want to update the value in the Consumption column for the first row.
df_from_csv.iloc[0:1, :]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1512 |
df_from_csv.loc[0, "Consumption"] = 1548
df_from_csv.iloc[0:1, :]
Year | NYC_Pop | PerCapita_Consumption | Consumption | |
---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 |
# We can also do quick calculated values across all rows and set the results on a new column
df_from_csv["Consumption_per_10000"] = df_from_csv.Consumption / 10000
df_from_csv.head(5)
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | |
---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0.1548 |
1 | 1980 | 7071639 | 213 | 1506 | 0.1506 |
2 | 1981 | 7089241 | 185 | 1309 | 0.1309 |
3 | 1982 | 7109105 | 194 | 1382 | 0.1382 |
4 | 1983 | 7181224 | 198 | 1424 | 0.1424 |
A great feature of Pandas is the .apply() method, which enables you to apply some kind of transformative function, either one you write yourself or one you pull from Python's core library, and propagate it across any portion of the dataframe you wish to change.
We can start with an example using a Python core function:
x = 1.6
round(x)
2
df_from_csv["Consumption_per_10000"] = df_from_csv["Consumption_per_10000"].apply(round)
df_from_csv.head(5)
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | |
---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0 |
1 | 1980 | 7071639 | 213 | 1506 | 0 |
2 | 1981 | 7089241 | 185 | 1309 | 0 |
3 | 1982 | 7109105 | 194 | 1382 | 0 |
4 | 1983 | 7181224 | 198 | 1424 | 0 |
# Alternative using a function you write yourself.
def check_if_high(in_val):
    if in_val > 200:
        return "Y"
    else:
        return "N"
df_from_csv["Consumption_high"] = df_from_csv["PerCapita_Consumption"].apply(check_if_high)
df_from_csv.head(5)
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | |
---|---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0 | Y |
1 | 1980 | 7071639 | 213 | 1506 | 0 | Y |
2 | 1981 | 7089241 | 185 | 1309 | 0 | N |
3 | 1982 | 7109105 | 194 | 1382 | 0 | N |
4 | 1983 | 7181224 | 198 | 1424 | 0 | N |
# And a final example, using a quick lambda function
df_from_csv["Total_sum"] = df_from_csv["Consumption"].apply(lambda x: x+1000)
df_from_csv.head(5)
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | Total_sum | |
---|---|---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0 | Y | 2548 |
1 | 1980 | 7071639 | 213 | 1506 | 0 | Y | 2506 |
2 | 1981 | 7089241 | 185 | 1309 | 0 | N | 2309 |
3 | 1982 | 7109105 | 194 | 1382 | 0 | N | 2382 |
4 | 1983 | 7181224 | 198 | 1424 | 0 | N | 2424 |
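As an aside, for a simple condition like check_if_high above, a vectorized alternative such as numpy's where() is usually faster on large tables. A sketch equivalent to the .apply() version:

import numpy as np

# "Y" where per-capita consumption exceeds 200, "N" otherwise
df_from_csv["Consumption_high"] = np.where(df_from_csv["PerCapita_Consumption"] > 200, "Y", "N")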
Helpfully, Pandas offers SQL-like means of joining (in the SQL sense: matching records on common keys, yielding a wider table) and of performing the equivalent of a SQL union (concatenating one table of the same structure and columns onto another, yielding a longer table).
These are known as a "merge" and a "concat" in Pandas.
# Constructing a second table with a plan to use "Year" as our common key
nyc_pop_density = pd.DataFrame([[1979,8.9],[1980,8.1],[1981,7.4],[1982,7.2],
[1983,6.5],[1984,6.5],[1985,6.0],[1986,5.8],
[1987,5.5],[1988,5.4]], columns=["pop_year","density_per_5000"])
nyc_water_consumption = df_from_csv.loc[df_from_csv.Year < 1989]
nyc_pop_density
pop_year | density_per_5000 | |
---|---|---|
0 | 1979 | 8.9 |
1 | 1980 | 8.1 |
2 | 1981 | 7.4 |
3 | 1982 | 7.2 |
4 | 1983 | 6.5 |
5 | 1984 | 6.5 |
6 | 1985 | 6.0 |
7 | 1986 | 5.8 |
8 | 1987 | 5.5 |
9 | 1988 | 5.4 |
nyc_water_consumption
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | Total_sum | |
---|---|---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0 | Y | 2548 |
1 | 1980 | 7071639 | 213 | 1506 | 0 | Y | 2506 |
2 | 1981 | 7089241 | 185 | 1309 | 0 | N | 2309 |
3 | 1982 | 7109105 | 194 | 1382 | 0 | N | 2382 |
4 | 1983 | 7181224 | 198 | 1424 | 0 | N | 2424 |
5 | 1984 | 7234514 | 203 | 1465 | 0 | Y | 2465 |
6 | 1985 | 7274054 | 182 | 1326 | 0 | N | 2326 |
7 | 1986 | 7319246 | 185 | 1351 | 0 | N | 2351 |
8 | 1987 | 7342476 | 197 | 1447 | 0 | N | 2447 |
9 | 1988 | 7353719 | 202 | 1484 | 0 | Y | 2484 |
# Performing a join ("merge") on the year column;
# we'll do a left join, meaning we keep all records in the water consumption table (the lefthand table)
# and only matching records from the righthand table (the pop density). As it happens, there is a 1-to-1 match
merged_df = pd.merge(nyc_water_consumption, nyc_pop_density, how="left", left_on="Year", right_on="pop_year")
merged_df
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | Total_sum | pop_year | density_per_5000 | |
---|---|---|---|---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0 | Y | 2548 | 1979 | 8.9 |
1 | 1980 | 7071639 | 213 | 1506 | 0 | Y | 2506 | 1980 | 8.1 |
2 | 1981 | 7089241 | 185 | 1309 | 0 | N | 2309 | 1981 | 7.4 |
3 | 1982 | 7109105 | 194 | 1382 | 0 | N | 2382 | 1982 | 7.2 |
4 | 1983 | 7181224 | 198 | 1424 | 0 | N | 2424 | 1983 | 6.5 |
5 | 1984 | 7234514 | 203 | 1465 | 0 | Y | 2465 | 1984 | 6.5 |
6 | 1985 | 7274054 | 182 | 1326 | 0 | N | 2326 | 1985 | 6.0 |
7 | 1986 | 7319246 | 185 | 1351 | 0 | N | 2351 | 1986 | 5.8 |
8 | 1987 | 7342476 | 197 | 1447 | 0 | N | 2447 | 1987 | 5.5 |
9 | 1988 | 7353719 | 202 | 1484 | 0 | Y | 2484 | 1988 | 5.4 |
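Had we wanted only the rows present in both tables, we would pass how="inner" instead. And since pop_year duplicates Year after the merge, we can drop it; a sketch:

merged_inner = pd.merge(nyc_water_consumption, nyc_pop_density, how="inner",
                        left_on="Year", right_on="pop_year").drop(columns=["pop_year"])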
# Let's say we have a second dataframe with the same type of data, using the same columns.
# We perform a concatenation in which the second dataframe's rows are appended to the original
additional_df = pd.DataFrame([[1989, 7367819, 189, 1456, 0, "Y", 2456], [1990, 7406718, 192, 1345, 0, "Y", 2345]],
                             columns=["Year", "NYC_Pop", "PerCapita_Consumption",
                                      "Consumption", "Consumption_per_10000", "Consumption_high", "Total_sum"])
additional_df
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | Total_sum | |
---|---|---|---|---|---|---|---|
0 | 1989 | 7367819 | 189 | 1456 | 0 | Y | 2456 |
1 | 1990 | 7406718 | 192 | 1345 | 0 | Y | 2345 |
# The resulting dataframe will preserve the original row indices of the original respective dataframes
# To reset the index, we pass an additional parameter: ignore_index=True
concat_df = pd.concat([nyc_water_consumption, additional_df], ignore_index=True)
concat_df
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | Total_sum | |
---|---|---|---|---|---|---|---|
0 | 1979 | 7102100 | 213 | 1548 | 0 | Y | 2548 |
1 | 1980 | 7071639 | 213 | 1506 | 0 | Y | 2506 |
2 | 1981 | 7089241 | 185 | 1309 | 0 | N | 2309 |
3 | 1982 | 7109105 | 194 | 1382 | 0 | N | 2382 |
4 | 1983 | 7181224 | 198 | 1424 | 0 | N | 2424 |
5 | 1984 | 7234514 | 203 | 1465 | 0 | Y | 2465 |
6 | 1985 | 7274054 | 182 | 1326 | 0 | N | 2326 |
7 | 1986 | 7319246 | 185 | 1351 | 0 | N | 2351 |
8 | 1987 | 7342476 | 197 | 1447 | 0 | N | 2447 |
9 | 1988 | 7353719 | 202 | 1484 | 0 | Y | 2484 |
10 | 1989 | 7367819 | 189 | 1456 | 0 | Y | 2456 |
11 | 1990 | 7406718 | 192 | 1345 | 0 | Y | 2345 |
There are a few key methods that we can call on a Pandas dataframe to help us get a sense of its shape and contents quickly:
# Check its size. This dataframe has 38 rows and 7 columns
df_from_csv.shape
(38, 7)
# Check for nulls; in this case, nulls in any row, any column.
df_from_csv[df_from_csv.isnull().any(axis=1)]
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Consumption_high | Total_sum
---|---|---|---|---|---|---
# Check if any given column is unique
df_from_csv.Year.is_unique
True
# Summary statistics
df_from_csv.describe()
Year | NYC_Pop | PerCapita_Consumption | Consumption | Consumption_per_10000 | Total_sum | |
---|---|---|---|---|---|---|
count | 38.000000 | 3.800000e+01 | 38.000000 | 38.000000 | 38.0 | 38.000000 |
mean | 1997.500000 | 7.764553e+06 | 161.552632 | 1241.631579 | 0.0 | 2241.631579 |
std | 11.113055 | 4.610189e+05 | 31.404680 | 173.980493 | 0.0 | 173.980493 |
min | 1979.000000 | 7.071639e+06 | 117.000000 | 996.000000 | 0.0 | 1996.000000 |
25% | 1988.250000 | 7.342901e+06 | 135.250000 | 1095.000000 | 0.0 | 2095.000000 |
50% | 1997.500000 | 7.815851e+06 | 155.500000 | 1238.500000 | 0.0 | 2238.500000 |
75% | 2006.750000 | 8.120906e+06 | 189.500000 | 1378.750000 | 0.0 | 2378.750000 |
max | 2016.000000 | 8.550405e+06 | 213.000000 | 1548.000000 | 0.0 | 2548.000000 |
# Single column value counts, max, min.
print(nyc_water_consumption.Consumption_high.value_counts())
print(nyc_water_consumption.NYC_Pop.max())
print(nyc_water_consumption.Consumption.min())
N    6
Y    4
Name: Consumption_high, dtype: int64
7353719
1309
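Two more quick-look methods worth adding here (standard Pandas, though not shown above) are .dtypes and .info():

print(df_from_csv.dtypes)    # data type of each column
df_from_csv.info()           # dtypes, non-null counts, and memory usage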
I love this plain explanation of filtering/selecting and refer back to it often: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
Official Pandas documentation, recently renewed: https://pandas.pydata.org/pandas-docs/stable/index.html
Pandas's own 10-minute quick start: https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#min
Software Carpentry's Pandas lesson (as part of general Python instruction): https://swcarpentry.github.io/python-novice-gapminder/08-data-frames/
NYU Data Services Quantitative guide for merging/joining datasets: https://guides.nyu.edu/quant/merge
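Answer to the slicing challenge posed earlier. The 12th-14th rows sit at index positions 11-13; an equivalent, arguably cleaner form would be df_from_csv.loc[11:13, ["NYC_Pop", "PerCapita_Consumption"]].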
df_from_csv[11:14][["NYC_Pop","PerCapita_Consumption"]]
NYC_Pop | PerCapita_Consumption | |
---|---|---|
11 | 7335650 | 194 |
12 | 7374501 | 199 |
13 | 7428944 | 184 |