Dask Fastparquet Read_Parquet too slow between versions 1.* and 2.*

17 Nov 2022

To create the test dataset:

import datetime
import os
import random

import fastparquet
import pandas as pd

folder_save = './test006'
start = datetime.datetime(2020, 6, 1)
colsize = 8
for y in range(400):  # <----------------------- took 1min 6s
    # one batch = 720 rows at 5-second intervals (one hour of data)
    row = []
    for i in range(720):
        start = start + datetime.timedelta(seconds=5)
        row.append([start, *[random.randint(1, 1000) for _ in range(colsize)]])
    df = pd.DataFrame(row)
    df.columns = ['ds', *[chr(65 + i) for i in range(colsize)]]  # ds, A, B, ..., H
    # partition columns derived from the timestamp
    df['year'] = df['ds'].apply(lambda x: x.timetuple().tm_year)
    df['month'] = df['ds'].apply(lambda x: x.timetuple().tm_mon)
    df['day'] = df['ds'].apply(lambda x: x.timetuple().tm_mday)
    # append once the first write has created the _metadata file
    append = {'append': True} if os.path.exists(folder_save + '/_metadata') else {}
    fastparquet.write(**{
        "filename": folder_save,
        "data": df,
        "compression": 'GZIP',
        "file_scheme": 'hive',
        "partition_on": ['year', 'month', 'day'],
        "write_index": False,
        **append
    })
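Because the loop appends 400 times into day-level hive partitions, the dataset ends up spread over many small files and row groups, which is part of what the reader has to chew through. To see how fragmented the result actually is, the metadata can be inspected directly with fastparquet; this is just an illustrative sketch on the folder written above, not part of the original script:

import fastparquet

# open the dataset through its _metadata file
pf = fastparquet.ParquetFile('./test006')
print('columns:   ', pf.columns)
print('row groups:', len(pf.row_groups))  # many small row groups = more metadata work per read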

To read it back:

import datetime
import dask
import dask.dataframe as dd

s = datetime.datetime.now().timestamp()
ddf = dd.read_parquet('./test006')  # read the whole partitioned dataset
df = ddf.compute()
e = datetime.datetime.now().timestamp()
print('took', e - s, 'secs')  # <-------------------------- took 1min 1s with dask==2.20.0
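When read_parquet is this slow, it can help to skip the per-file statistics gathering and to read only the columns and partitions that are actually needed. A hedged sketch of what that looks like with the dask 2.x API (gather_statistics is the keyword in dask==2.20.0 and was renamed in later releases; the filter values and column list here are just examples for the dataset above):

import datetime
import dask.dataframe as dd

s = datetime.datetime.now().timestamp()
ddf = dd.read_parquet(
    './test006',
    engine='fastparquet',
    gather_statistics=False,  # skip min/max statistics collection (dask 2.x keyword)
    filters=[('year', '==', 2020), ('month', '==', 6)],  # example partition pruning
    columns=['ds', 'A', 'B'],  # read only the columns that are needed
)
df = ddf.compute()
e = datetime.datetime.now().timestamp()
print('took', e - s, 'secs')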