Check number of yellow taxi trip records.
SELECT COUNT(*) "Count" FROM raw_yellow_tripdata;
Explore data categories.
-- observe NULL values
SELECT vendorid, COUNT(*) "Count"
FROM raw_yellow_tripdata
GROUP BY vendorid
ORDER BY 1;
-- observe other categories
SELECT pulocationid, COUNT(*) "Count"
FROM raw_yellow_tripdata
GROUP BY pulocationid
ORDER BY 1;
-- observe NULL values
SELECT payment_type, COUNT(*) "Count"
FROM raw_yellow_tripdata
GROUP BY payment_type
ORDER BY 1;
Explore records with NULL Vendor ID.
-- observe other columns with NULL values
-- passenger_count, ratecodeid, store_and_fwd_flag, payment_type
SELECT *
FROM raw_yellow_tripdata
WHERE vendorid IS NULL
LIMIT 100;
Explore records by time period.
-- tpep_pickup_datetime is defined as STRING
-- observe record counts that falls outside of the time period
SELECT SUBSTR(tpep_pickup_datetime, 1, 7) "Period", COUNT(*) "Total Records"
FROM raw_yellow_tripdata
GROUP BY SUBSTR(tpep_pickup_datetime, 1, 7)
ORDER BY 1;
Count records that falls outside of year 2020.
-- records with incorrect pickup datetime values
SELECT COUNT(*) "Count"
FROM raw_yellow_tripdata
WHERE SUBSTR(tpep_pickup_datetime, 1, 7) NOT LIKE '2020%';
Count records with NULL values (based on Vendor ID) that falls within 2020.
-- Records with NULL categories like Vendor ID
SELECT COUNT(*) "Count"
FROM raw_yellow_tripdata
WHERE vendorid IS NULL
AND SUBSTR(tpep_pickup_datetime, 1, 7) LIKE '2020%';
Count records that falls in the last quarter of 2020, exclude records with missing Vendor ID.
-- Total records in BER months, excluding columns with missing Vendor ID
SELECT COUNT(*) "Count"
FROM raw_yellow_tripdata
WHERE vendorid IS NOT NULL
AND SUBSTR(tpep_pickup_datetime, 1, 7) LIKE '2020-1%';
Join taxi trips data with taxi zone look up table.
-- explore data with lookup information
-- observe column names from lookup tables
SELECT td.*, pu.*, do.*
FROM raw_yellow_tripdata td,
taxi_zone_lookup pu,
taxi_zone_lookup do
WHERE td.pulocationid = pu.locationid AND
td.pulocationid = do.locationid AND
vendorid IS NOT NULL AND
SUBSTR(tpep_pickup_datetime, 1, 7) LIKE '2020-1%'
LIMIT 100;
-- Count total joined records for the last quarter of 2020.
SELECT COUNT(*) "Count"
FROM raw_yellow_tripdata td,
taxi_zone_lookup pu,
taxi_zone_lookup do
WHERE td.pulocationid = pu.locationid AND
td.pulocationid = do.locationid AND
vendorid IS NOT NULL AND
SUBSTR(tpep_pickup_datetime, 1, 7) LIKE '2020-1%';