SQL database with python

Author

Tony Duan

1 Connection with database

Code

# Import seaborn for loading sample datasets
import seaborn as sns
# Import pandas for data manipulation
import pandas as pd
# Import sqlite3 for SQLite database interaction
import sqlite3

# Load the 'tips' dataset from seaborn
tips = sns.load_dataset("tips")
# Load the 'planets' dataset from seaborn
planets=sns.load_dataset('planets')

1.1 create database file pythonsqlite.db and copy tips data and planets data into database

Code

# Import the sqlite3 module
import sqlite3
# Create a connection to the SQLite database file 'pythonsqlite.db'
db = sqlite3.connect('pythonsqlite.db')

# Write the 'tips' DataFrame to a SQL table named 'tips', replacing it if it already exists
tips.to_sql(name='tips', con=db,if_exists='replace')
# Write the 'planets' DataFrame to a SQL table named 'planets', replacing it if it already exists
planets.to_sql(name='planets', con=db,if_exists='replace')

# Close the database connection
db.close()

1.2 check all table in database

Code

# Connect to the SQLite database file 'pythonsqlite.db'
db = sqlite3.connect('pythonsqlite.db')
# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrame
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)
# Display the DataFrame
table

	name
0	tips
1	planets

2 SQL

2.1 select

Code

# Connect to the SQLite database
db = sqlite3.connect('pythonsqlite.db')
# Define an SQL query to select all columns from the 'tips' table, limiting to 3 rows
sql="select * from tips LIMIT 3;"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the DataFrame
table

	index	total_bill	tip	sex	smoker	day	time	size
0	0	16.99	1.01	Female	No	Sun	Dinner	2
1	1	10.34	1.66	Male	No	Sun	Dinner	3
2	2	21.01	3.50	Male	No	Sun	Dinner	3

2.2 Renaming column

Code

# Define an SQL query to select 'total_bill' and alias it as 'new_total_bill'
sql="select total_bill as new_total_bill from tips"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	new_total_bill
0	16.99
1	10.34
2	21.01
3	23.68
4	24.59

2.3 create column

Code

# Define an SQL query to select 'total_bill' as 'new_total_bill' and also the original 'total_bill'
sql="select total_bill as new_total_bill,total_bill from tips"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	new_total_bill	total_bill
0	16.99	16.99
1	10.34	10.34
2	21.01	21.01
3	23.68	23.68
4	24.59	24.59

2.4 Filter rows

Code

# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male'
sql="select * from tips where sex='Male'"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	index	total_bill	tip	sex	smoker	day	time	size
0	1	10.34	1.66	Male	No	Sun	Dinner	3
1	2	21.01	3.50	Male	No	Sun	Dinner	3
2	3	23.68	3.31	Male	No	Sun	Dinner	2
3	5	25.29	4.71	Male	No	Sun	Dinner	4
4	6	8.77	2.00	Male	No	Sun	Dinner	2

2.4.1 Filters with AND conditions

Code

# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' AND 'size' is greater than 3
sql="select * from tips where sex='Male' and size>3"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	index	total_bill	tip	sex	smoker	day	time	size
0	5	25.29	4.71	Male	No	Sun	Dinner	4
1	7	26.88	3.12	Male	No	Sun	Dinner	4
2	13	18.43	3.00	Male	No	Sun	Dinner	4
3	23	39.42	7.58	Male	No	Sat	Dinner	4
4	25	17.81	2.34	Male	No	Sat	Dinner	4

2.4.2 Filters with or conditions

Code

# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' OR 'size' is greater than 3
sql="select * from tips where sex='Male' or size>3"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the shape of the DataFrame (number of rows, number of columns)
table.shape

(169, 8)

2.5 Append

2.5.1 append by row

Code

# Define an SQL query to union all rows from 'tips' with itself
sql="select * from tips UNION all select * from tips"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the shape of the DataFrame
table.shape

(488, 8)

2.5.2 append by column

2.5.3 Dropping NA values

2.5.4 keep NA values

2.6 group by

2.6.1 average,min,max,sum

Code

# Define an SQL query to calculate the average, minimum, maximum, and sum of 'total_bill' grouped by 'sex'
sql="select AVG(total_bill),min(total_bill),max(total_bill),sum(total_bill) from tips group by sex"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the DataFrame
table

	AVG(total_bill)	min(total_bill)	max(total_bill)	sum(total_bill)
0	18.056897	3.07	44.30	1570.95
1	20.744076	7.25	50.81	3256.82

2.6.2 count

Code

# Define an SQL query to count records grouped by 'sex'
sql="select sex,count(*) from tips group by 1"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the DataFrame
table

	sex	count(*)
0	Female	87
1	Male	157

2.7 order rows

Code

# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in ascending order
sql="select * from tips order by total_bill"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	index	total_bill	tip	sex	smoker	day	time	size
0	67	3.07	1.00	Female	Yes	Sat	Dinner	1
1	92	5.75	1.00	Female	Yes	Fri	Dinner	2
2	111	7.25	1.00	Female	No	Sat	Dinner	1
3	172	7.25	5.15	Male	Yes	Sun	Dinner	2
4	149	7.51	2.00	Male	No	Thur	Lunch	2

2.7.1 Sort in descending order

Code

# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in descending order
sql="select * from tips order by total_bill desc"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	index	total_bill	tip	sex	smoker	day	time	size
0	170	50.81	10.00	Male	Yes	Sat	Dinner	3
1	212	48.33	9.00	Male	No	Sat	Dinner	4
2	59	48.27	6.73	Male	No	Sat	Dinner	4
3	156	48.17	5.00	Male	No	Sun	Dinner	6
4	182	45.35	3.50	Male	Yes	Sun	Dinner	3

2.7.2 Arrange by multiple variables

Code

# Define an SQL query to select all columns from 'tips' and order by 'total_bill' then by 'tip'
sql="select * from tips order by total_bill,tip"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()

	index	total_bill	tip	sex	smoker	day	time	size
0	67	3.07	1.00	Female	Yes	Sat	Dinner	1
1	92	5.75	1.00	Female	Yes	Fri	Dinner	2
2	111	7.25	1.00	Female	No	Sat	Dinner	1
3	172	7.25	5.15	Male	Yes	Sun	Dinner	2
4	149	7.51	2.00	Male	No	Thur	Lunch	2

2.8 join

Joining tables is a fundamental operation in SQL, allowing you to combine rows from two or more tables based on a related column between them. Here are common types of joins:

2.8.1 inner_join

An INNER JOIN returns only the rows that have matching values in both tables.

# Create sample DataFrames for demonstration
lhs_df = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
rhs_df = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})

# Perform an INNER JOIN using pandas merge
inner_join_result = pd.merge(lhs_df, rhs_df, on='id', how='inner')
print("Inner Join Result:")
print(inner_join_result)

2.8.2 full join

A FULL OUTER JOIN returns all rows when there is a match in one of the tables. If there are no matches, it returns NULL values for columns from the table that doesn’t have a match.

# Perform a FULL OUTER JOIN using pandas merge
full_join_result = pd.merge(lhs_df, rhs_df, on='id', how='outer')
print("\nFull Outer Join Result:")
print(full_join_result)

2.8.3 left join

A LEFT JOIN (or LEFT OUTER JOIN) returns all rows from the left table, and the matching rows from the right table. If there is no match from the right table, NULL values are used.

# Perform a LEFT JOIN using pandas merge
left_join_result = pd.merge(lhs_df, rhs_df, on='id', how='left')
print("\nLeft Join Result:")
print(left_join_result)

2.8.4 anti join

An ANTI JOIN returns rows from the left table that have no matching rows in the right table.

# Perform an ANTI JOIN (rows in lhs_df not in rhs_df)
anti_join_result = lhs_df[~lhs_df['id'].isin(rhs_df['id'])]
print("\nAnti Join Result (lhs_df not in rhs_df):")
print(anti_join_result)

2.9 Reshape tables

2.9.1 Gather data long(wide to long)

2.9.2 Spread data wide (long to wide)

2.10 string

2.10.1 upper case

2.10.2 lower case

2.10.3 match

2.10.4 concatenation

2.10.5 replace

2.10.6 extract

2.11 date

2.12 create table into database

Code

# Define an SQL query to create a new table named 'new_tips' if it doesn't already exist, copying data from 'tips'
sql="create table if not exists new_tips as select * from tips"
# Execute the SQL query
db.execute(sql)

# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrame
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)
# Display the DataFrame
table

	name
0	tips
1	planets
2	new_tips

2.13 delete table in database

Code

# Define an SQL query to drop the table named 'new_tips' if it exists
sql="drop table if  exists new_tips"
# Execute the SQL query
db.execute(sql)

# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrame
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)
# Display the DataFrame
table

	name
0	tips
1	planets

2.14 edit table in database

Code

# Close the database connection
db.close()

3 Using SQL with Pandas

Code

# Import the sqldf function from the pandasql library
from pandasql import sqldf

Code

# Define an SQL query to select the first 5 rows from the 'tips' table
sql="SELECT * FROM tips LIMIT 5;"
# Execute the SQL query on the 'tips' DataFrame and store the result in 'data'
data=sqldf(sql)
# Display the resulting DataFrame
data

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

1 Connection with database

1.1 create database file pythonsqlite.db and copy tips data and planets data into database

1.2 check all table in database

2 SQL

2.1 select

2.2 Renaming column

2.3 create column

2.4 Filter rows

2.4.1 Filters with AND conditions

2.4.2 Filters with or conditions

2.5 Append

2.5.1 append by row

2.5.2 append by column

2.5.3 Dropping NA values

2.5.4 keep NA values

2.6 group by

2.6.1 average,min,max,sum

2.6.2 count

2.7 order rows

2.7.1 Sort in descending order

2.7.2 Arrange by multiple variables

2.8 join

2.8.1 inner_join

2.8.2 full join

2.8.3 left join

2.8.4 anti join

2.9 Reshape tables

2.9.1 Gather data long(wide to long)

2.9.2 Spread data wide (long to wide)

2.10 string

2.10.1 upper case

2.10.2 lower case

2.10.3 match

2.10.4 concatenation

2.10.5 replace

2.10.6 extract

2.11 date

2.12 create table into database

2.13 delete table in database

2.14 edit table in database

3 Using SQL with Pandas

4 reference: