We have barely scratched the surface of Pandas and data exploration and analysis.
In fact, we have already pretty much skipped over a section on conditional functions, which are vitally important to intermediate and advanced data workflows.
However, let's move on and quickly talk about how to work with Big Data.
Before we begin we need to define Big Data!
QUESTION: What is big data?
For today's purposes we will define Big Data as data that cannot fit into computer memory (RAM).
That limit can be reached by either, or a combination of, the number of rows and the number of columns in your data.
Pandas' limitations are not only about files larger than the RAM you have available.
For instance I have a workstation with 128GB of RAM.
You might assume I could use Pandas on a .csv file that is upwards of 128 GB.
In reality, Pandas will start bogging down at around a million rows of data.
Much larger than that and it simply stops working.
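If you want a rough sense of whether your data fits comfortably, Pandas can report a DataFrame's in-memory footprint. A minimal sketch (the file name below is made up, not one of our workshop files):

import pandas as pd

df = pd.read_csv("some_large_file.csv")        # hypothetical file
mem_bytes = df.memory_usage(deep=True).sum()   # deep=True also counts string/object columns
print(f"DataFrame uses ~{mem_bytes / 1e9:.2f} GB of RAM")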
QUESTION: So how can we work with Big data?
The first thing we will look at is smaller 'big' data.
For most of you, it is enough to simply move away from simple file formats like Excel's .xls or the .csv files we have been using.
We have already seen an instance where the text-based .csv format caused us woes.
So what is the alternative?
Both Excel and csv files are extremely common data file formats people work with.
However, if your data is larger, roughly speaking >50 MB and definitely >100 MB, you should consider saving your data out as a Parquet file.
Why Parquet?
Excel and csv are row-oriented formats, while Parquet is column-oriented. No need to dig into what that means beyond noting the most important consequence -
Parquet builds smaller files and thus works with them faster!
Parquet would also help us get around the text-based artifacts we ran into when working with the country data.
Working with Parquet: To work with Parquet you need to install either the fastparquet or pyarrow package.
Once you have one of the above packages installed you do NOT need to actually import it. If it's there, Pandas will import it in the background for you!
To use Parquet you can simply replace the csv functions with their Parquet equivalents, along with the file extensions;
DataFrame.to_csv("filename.csv")
==> DataFrame.to_parquet("filename.parquet")
pd.read_csv("filename.csv")
==> pd.read_parquet("filename.parquet")
Now for the real fun. Just saving to the Parquet file format already gives you a healthy file size reduction (usually >40% smaller).
But you can go one step further and have Pandas compress your Parquet file using gzip.
Another advantage of the gzipped files is that if you need to share them with collaborators, the much smaller sizes make uploading and downloading much faster as well.
The only additional thing you need to do is tell to_parquet() the compression method you desire and use the appropriate file extension.
For example;
DataFrame.to_parquet("filename.gzip", compression='gzip')
As an example of converting a file from csv to Parquet, this is all you really need to do:
fName = os.path.join('Data', 'My_cool_but_large_data.csv')
df = pd.read_csv(fName)
#
# Do all the processing you desire
#
# Save the DataFrame as a Parquet file
fName = os.path.join('Data', 'My_processed_data.parquet')
df.to_parquet(fName)
# Alternatively, as a gzipped file
fName = os.path.join('Data', 'My_processed_data.gzip')
df.to_parquet(fName, compression='gzip')
First you will need to install a Parquet package.
Pandas seems to like fastparquet best, so we can install it by uncommenting the following line and running it.
#!pip install fastparquet
Load the packages as you always need to.
import os
import pandas as pd
import numpy as np
NOTE: You will not need to import Parquet. Pandas will deal with it on its own!
You Try it: Load our 'Combined_lists.csv' file as we have been. Then save it back out as a 'Test.parquet' file. Then save it back out as a 'Test.gzip' file.
Solution:
# Load the .csv into a DataFrame
df = pd.read_csv(os.path.join('Processed_data', 'Combined_lists.csv'))
# Save the DataFrame as a parquet file
df.to_parquet(os.path.join('Processed_data', 'Test.parquet'))
# Save the DataFrame as a parquet.gzip file
df.to_parquet(os.path.join('Processed_data', 'Test.gzip'), compression='gzip')
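As an optional sanity check (not part of the exercise), you can read the Parquet file back in and confirm nothing was lost in the round trip:

# Read the Parquet file back and compare it with the original DataFrame
df_check = pd.read_parquet(os.path.join('Processed_data', 'Test.parquet'))
print(df.equals(df_check))   # Should print True if the round trip preserved the data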
Let's look at the improvement from switching away from .csv.
Note: the size reduction does not translate directly into a speed improvement, but it will speed things up.
# Percent size reduction going from size s1 to size s2
def improvement(s1, s2):
    return np.round((s1-s2)/s1 * 100, 1)
csv = 47971
par = 25141
gzp = 15778
print('Size reductions')
print(f'csv to parquet: {improvement(csv, par)}% reduction')
print(f'parquet to gzip: {improvement(par, gzp)}% reduction')
print(f'csv to gzip: {improvement(csv, gzp)}% reduction')
Size reductions
csv to parquet: 47.6% reduction
parquet to gzip: 37.2% reduction
csv to gzip: 67.1% reduction
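The sizes above were hard-coded from my run. If you want to compute your own numbers, you can pull the byte counts straight from disk with os.path.getsize(); a quick sketch using the files we just wrote:

csv = os.path.getsize(os.path.join('Processed_data', 'Combined_lists.csv'))
par = os.path.getsize(os.path.join('Processed_data', 'Test.parquet'))
gzp = os.path.getsize(os.path.join('Processed_data', 'Test.gzip'))
print(f'csv to gzip: {improvement(csv, gzp)}% reduction')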
I need to mention that when your data turns into truly large data (say millions of rows and/or columns), Pandas becomes a questionable choice for your data needs.
Likewise for scientific data you may be using special file formats such as netCDF, HDF4, HDF5 or others.
In both of these cases you should look into XArray and/or Dask.
These packages are built to look and feel pretty much just like Pandas and include many of the same functions and operations.
Often not much of your Pandas code needs to change at all.
Working with Dask as part of your data pipeline brings three MASSIVE advantages, especially when used in conjunction with XArray.
The real power here lies in the ability to combine both XArray and Dask, heck sometimes even Pandas functionality together in your workflow.
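To give a hedged taste of that look and feel (the file pattern and the 'temperature' column below are made-up names, not part of our data): a Dask DataFrame can lazily point at a pile of csv files that would never fit in RAM, and only does the work, chunk by chunk, when you call .compute().

import os
import dask.dataframe as dd

ddf = dd.read_csv(os.path.join('Data', 'huge_files_*.csv'))   # lazy: nothing is loaded yet
mean_temp = ddf['temperature'].mean().compute()               # the actual work happens here, in chunks
print(mean_temp)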
Yet these are not the only Pandas-like packages out there. There are many, and more are being developed all the time.
The one thing they have in common is that, as much as possible, they are designed to mimic the look and feel of working with Pandas.
The advantages to the newer ones? They all try to work faster and against larger data!
These packages are well worth a good look if you really need more oomph from your data workflow pipeline.
Clicking on the links below will take you to their respective home pages:
As a quick example, below is a snapshot of some code used to create data for a DaVinci Resolve workshop I host.
A bunch of MODIS files were downloaded from an LDAAC server as HDF4 files.
To process all these files down to just what was needed for a series of animations, I:
Total runtime using this method across all these files took only an hour.
Total runtime if I had tried this with Pandas? Just a fraction of a second before it complained that it could not do almost any of this!
Why go through all of this?
The original raw data files totaled 179.3 GB (puny compared to other data I processed for the DaVinci and other workshops, but still!!!).
Getting the data into a usable format, with only what I needed, brought it down to just 50.8 GB!
For geosciences and astrophysics it is fairly common to have hundreds or thousands of files, each >35 GB, so processing >>TB of data quickly (enough) is extremely important, and Dask, XArray, and the other tools mentioned all make that happen!
import numpy as np
import xarray as xr
import rioxarray as rxr
import os
from glob import glob
def process_modis_data(fName):
    # Open each file as an XArray Dataset using rioxarray's rasterio engine in the background
    modis = xr.open_dataset(fName, engine='rasterio')
    # Drop everything except the Land Surface Temperature (LST)
    modis = modis.drop(['Clear_sky_nights', 'Percent_land_in_grid', 'QC_Day', 'Day_view_time', 'Day_view_angl', 'Clear_sky_days', 'LST_Night_CMG', 'QC_Night', 'Night_view_time', 'Night_view_angl'])
    modis = modis.drop(['Emis_20', 'Emis_22', 'Emis_23', 'Emis_29', 'Emis_31', 'Emis_32', 'spatial_ref'])
    # Update the attributes for the file with reprocessing info
    modis.attrs['UNITS'] = 'celsius'
    modis.attrs['REPROCESSING_AUTHOR'] = 'Tim Dunn'
    modis.attrs['REPROCESSING_DATE'] = 'Jan 2022'
    modis.attrs['REPROCESSING_OUTPUT_FORMAT'] = 'netCDF'
    modis.attrs['REPROCESSING_NOTES'] = 'Restricted DataSet to LST_Day_CMG and converted from kelvin to celsius for the sole purpose of CU CRDDS tutorial workshops'
    # Convert degrees kelvin to celsius
    a = modis.to_dataframe()                         # To make this easier to follow, convert from an XArray Dataset to a DataFrame
    a['LST_Day_CMG'] = a['LST_Day_CMG'] - 273.15     # Convert from degrees kelvin to celsius
    b = xr.Dataset.from_dataframe(a)                 # To save as a netCDF the way I need it, convert back from the pandas DataFrame to an XArray Dataset
    # Build the output path & name
    start = modis.attrs['RANGEBEGINNINGDATE'][:7]    # Extract the year and month for use as part of the file name
    outpath = 'Processed_Data'                       # Output file directory
    fName = fName[9:]                                # Extract the base input file name as the base for the output file name
    name = f'{fName[:7]}_{fName[8:16]}_{start}.nc'   # Build the full output file name
    outname = os.path.join(outpath, name)            # Join the path and file name
    # Save the Dataset to a netCDF file
    b.to_netcdf(outname)
    print(outname)   # Print the completed file path+name as a progress indicator, since this will take about an hour to run through all the files
inpath = os.path.join('Raw_Data', '*.hdf')
for f in glob(inpath):
    process_modis_data(f)
For those wiser than this workshop requires: yes, there is an even better way of working with a series of files, using XArray's ability to open files in a batch-like mode.
But the even better juju happens when you are doing a lot more processing per file and mix Dask into the pipeline.
Alas, these tricks are for an entirely different, more advanced workshop.
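Just to hint at what that batch-like mode looks like, here is a hedged sketch (assuming Dask is installed and that the processed files can sensibly be stacked along a new 'time' dimension):

import os
import xarray as xr

# Lazily open every processed file as one Dataset, stacked along a new 'time' dimension.
# Nothing is read into memory until you actually ask for values.
ds_all = xr.open_mfdataset(os.path.join('Processed_Data', '*.nc'),
                           combine='nested', concat_dim='time')
print(ds_all)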
To install any missing required packages, just uncomment the desired lines below and run the cell.
Once installed we can import the packages.
#!pip install netCDF4
#!pip install dask
#!pip install xarray
import glob
import netCDF4
import dask.array as da
import dask.dataframe as dd
import xarray as xr
Now we can open the netCDF file as an XArray Dataset.
Ignore all the warnings; they won't affect us at all.
ds = xr.open_dataset(os.path.join('netCDF_files', '20190801_0000.nc'))
C:\Programs\Anaconda3\lib\site-packages\xarray\conventions.py:521: SerializationWarning: variable 'dewpoint' has multiple fill values {-9999.0, 3.4028235e+38}, decoding all values to NaN.
  new_vars[k] = decode_cf_variable(
(... the same SerializationWarning repeats for 'elevation', 'fuelMoisture', 'fuelTemperature', 'latitude', 'longitude', 'observationTime', 'precip1min', 'precipAccum', 'precipIntensity', 'precipRate', 'precipType', 'relHumidity', 'reportTime', 'seaLevelPressure', 'skyCovLayerBase', 'soilMoisture', 'soilMoisturePercent', 'soilTemperature', 'solarRadiation', 'stationPressure', 'temperature', 'visibility', 'windDir10', 'windDirMax', 'windDirMax10', 'windGust', 'windGust10', and 'windSpeed10' ...)
One handy thing about XArray is that it provides a nice way of viewing the dataset.
When you display the data you can click on the non-grayed-out sections to drill down into the actual data!
One of the great things about HDF5 and netCDF, which has HDF5 at its core, is all the metadata you get with the files!
ds
<xarray.Dataset>
Dimensions:          (recNum: 139821, maxSensor: 2, maxSkyCover: 6, maxStaticIds: 45000)
Dimensions without coordinates: recNum, maxSensor, maxSkyCover, maxStaticIds
Data variables: (12/48)
    dataProvider     (recNum) |S11 ...
    dewpoint         (recNum) float32 ...
    dewpointDD       (recNum) |S1 ...
    elevation        (recNum) float32 ...
    fuelMoisture     (recNum) float32 ...
    fuelTemperature  (recNum) float32 ...
    ...               ...
    windDirMax10     (recNum) float32 ...
    windDirMaxDD     (recNum) |S1 ...
    windGust         (recNum) float32 ...
    windGust10       (recNum) float32 ...
    windGustDD       (recNum) |S1 ...
    windSpeed10      (recNum) float32 ...
Attributes: (12/109)
    cdlDate:                  20160212
    idVariables:              providerId,dataProvider
    timeVariables:            observationTime,reportTime,receivedTime
    filePeriod:               3600
    fileEndOffset:            3599
    Conventions:              MADIS surface observations, v1.0
    ...                       ...
    ICR_NoBitsSet:            No IC applied
    ICR_Bit1Set:              Master bit - at least 1 check applied
    ICR_BitiSet:              IC check # applied
    ICR_LeastSignificantBit:  bit1
    ICR_reference:            IC check #'s defined in IC check table
    NCO:                      4.7.0
Now let's load the data into a DataFrame.
df = ds.to_dataframe()
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_34844\2300985700.py in <cell line: 1>()
----> 1 df = ds.to_dataframe()

C:\Programs\Anaconda3\lib\site-packages\xarray\core\dataset.py in to_dataframe(self, dim_order)
   5896         ordered_dims = self._normalize_dim_order(dim_order=dim_order)
   5897
-> 5898         return self._to_dataframe(ordered_dims=ordered_dims)
   5899
   5900     def _set_sparse_data_from_dataframe(

C:\Programs\Anaconda3\lib\site-packages\xarray\core\dataset.py in _to_dataframe(self, ordered_dims)
   5860     def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
   5861         columns = [k for k in self.variables if k not in self.dims]
-> 5862         data = [
   5863             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
   5864             for k in columns

C:\Programs\Anaconda3\lib\site-packages\xarray\core\dataset.py in <listcomp>(.0)
   5861         columns = [k for k in self.variables if k not in self.dims]
   5862         data = [
-> 5863             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
   5864             for k in columns
   5865         ]

MemoryError: Unable to allocate 773. GiB for an array with shape (139821, 2, 6, 45000) and data type |S11
I did warn you Pandas does not like large files!!!
That's OK; for our purposes we can work around this limitation with relative ease using Dask, which allows out-of-memory (lazy) compute!
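As a rough sketch of that idea before the real example: asking XArray to back the file with Dask chunks means only the pieces you actually touch are read into memory (the chunk size below is an arbitrary guess, not a recommendation):

ds_lazy = xr.open_dataset(os.path.join('netCDF_files', '20190801_0000.nc'),
                          chunks={'recNum': 10000})      # Dask-backed, so the load is lazy
mean_temp = ds_lazy['temperature'].mean().compute()      # only 'temperature' is read, chunk by chunk
print(float(mean_temp))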
Below is a real world example I created a few years back for friends in ATOC.
Basically it takes all the netCDF files (I provide only a few of the hundreds we ran) and extracts just the variables we need into a new, much smaller netCDF file.
#Declare a variable for each netCDF variable you want to extract and then add to the new netCDF file
lat = []
lon = []
stationName = []
stationId = []
temperature = []

#Iterate through each netCDF file
for f in glob.glob(os.path.join('netCDF_files', '*.nc')):
    #Open each file with netCDF4 because xarray will not play nicely, if at all, with this kind of data
    #(it's a non-equal/infinite dimensions issue well known to the xarray devs, but no solution is coming anytime soon)
    nc = netCDF4.Dataset(f)

    #Extract the desired variables and save each one as a Dask array
    lat = da.from_array(nc['latitude'])
    lon = da.from_array(nc['longitude'])
    stationName = da.from_array(netCDF4.chartostring(nc['stationName'][:]))
    stationId = da.from_array(netCDF4.chartostring(nc['stationId'][:]))
    temperature = da.from_array(nc['temperature'])

#Concatenate the arrays into a Dask dataframe
ddf = dd.concat([dd.from_dask_array(c) for c in [lat, lon, stationName, stationId, temperature]], axis=1)

#Now convert to a pandas dataframe since there currently is no way to send it directly to an xarray dataset.
#There are several open tickets requesting this for both xarray and dask, but the devs are stumped on a direct
#way to implement it due to a 'several dimensions' issue.
ddf = ddf.compute()

######################################################################################################################
# At this point do any/all other data processing you desire, as everything 'should' fit in RAM just fine
######################################################################################################################

#Do stuff

#Add the column header names to the dataframe
ddf.columns = ['latitude', 'longitude', 'stationName', 'stationId', 'temperature']

#Convert the pandas dataframe to an xarray dataset
pdf = ddf.to_xarray()

#Save the xarray dataset to a new netCDF file
#NOTES: This is NOT a CF-compliant netCDF as it has no attributes, but you can easily add them.
#       This also does not carry the metadata over, but you can just as easily add it in as well.
pdf.to_netcdf(os.path.join('netCDF_files', 'demo_netcdf.nc'), 'w')
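If you do want to nudge the output toward CF compliance, here is a hedged sketch of bolting a few attributes on before the save (the values below are illustrative assumptions, not checked against the CF conventions):

pdf.attrs['title'] = 'Station observations extracted for demo purposes'   # assumed description
pdf['latitude'].attrs['units'] = 'degrees_north'
pdf['longitude'].attrs['units'] = 'degrees_east'
pdf['temperature'].attrs['units'] = 'kelvin'                              # assumption about the source units
pdf.to_netcdf(os.path.join('netCDF_files', 'demo_netcdf.nc'), 'w')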
And now we can look at the final netCDF file's information, pretty much like you would with ncdump.
nc = netCDF4.Dataset(os.path.join('netCDF_files','demo_netcdf.nc'))
nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): index(137724)
    variables(dimensions): int64 index(index), float32 latitude(index), float32 longitude(index), <class 'str'> stationName(index), <class 'str'> stationId(index), float32 temperature(index)
    groups:
nc.dimensions
{'index': <class 'netCDF4._netCDF4.Dimension'>: name = 'index', size = 137724}
nc.variables
{'index': <class 'netCDF4._netCDF4.Variable'> int64 index(index) unlimited dimensions: current shape = (137724,) filling on, default _FillValue of -9223372036854775806 used,
 'latitude': <class 'netCDF4._netCDF4.Variable'> float32 latitude(index) _FillValue: nan unlimited dimensions: current shape = (137724,) filling on,
 'longitude': <class 'netCDF4._netCDF4.Variable'> float32 longitude(index) _FillValue: nan unlimited dimensions: current shape = (137724,) filling on,
 'stationName': <class 'netCDF4._netCDF4.Variable'> vlen stationName(index) vlen data type: <class 'str'> unlimited dimensions: current shape = (137724,),
 'stationId': <class 'netCDF4._netCDF4.Variable'> vlen stationId(index) vlen data type: <class 'str'> unlimited dimensions: current shape = (137724,),
 'temperature': <class 'netCDF4._netCDF4.Variable'> float32 temperature(index) _FillValue: nan unlimited dimensions: current shape = (137724,) filling on}
First of all, take a breath and most probably some aspirin for the headache - that was A LOT of material.
Now you understand the verbosity of these Notebooks, so you can come back later and investigate the parts that may be of interest to you.
But we have not even begun to scratch the surface.
In a few weeks I'll be teaching a 'Data Science with Python' series which will cover so much more than just this, and with different data.
If interested, stay tuned to Kim's weekly CRDDS Newsletters for more information.