import dask
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt
import fastparquet
# Notebook-style echo of the installed fastparquet version; as a plain
# script this expression statement has no visible effect.
fastparquet.__version__

# Load the sample frame and drop fully duplicated rows.
# NOTE(review): unpickling executes code from the file — only load trusted pickles.
df = pd.read_pickle("dummy_data2.pkl").drop_duplicates()
df[['ds', 'J']].set_index('ds').plot(figsize=(17, 6))

# Derive the calendar columns that the parquet writes below partition on.
# The vectorized ``.dt`` accessor replaces three row-by-row ``apply`` passes
# that built a ``timetuple`` per element.  ``pd.to_datetime`` is a no-op for
# a column that is already datetime64 and makes ``.dt`` usable if ``ds`` was
# stored as generic objects.
# assumes every ``ds`` value is datetime-like with a single timezone — TODO confirm
ds_values = pd.to_datetime(df["ds"])
df["year"] = ds_values.dt.year
df["month"] = ds_values.dt.month
df["day"] = ds_values.dt.day

# Notebook-style previews (no output when run as a script).
df.head()
#df = df.set_index("ds")
len(df)
# Split the frame, ordered by ``ds``, into ``chunk`` consecutive pieces and
# pickle each one as ``p2_<i>.pkl``.  Every piece after the first starts
# ``overlap`` rows early, so neighbouring pieces share rows.
import math

chunk = 10      # number of pieces
overlap = 3000  # rows a piece shares with its predecessor

# 'records' is the full spelling of the orient; the short alias 'r' was
# removed in pandas 2.0.
datashuffle = df.sort_values('ds').to_dict('records')
# Shuffling is currently disabled:
#random.shuffle(datashuffle)
size = math.ceil(len(datashuffle) / chunk)

dfs = []
for i in range(chunk):
    # Clamp at 0: a negative start would make the list slice count from the
    # end of the data and return a wrong (often empty) piece whenever a
    # piece is shorter than ``overlap`` rows.
    start = max(0, i * size - overlap)
    stop = size * (i + 1)
    dfs.append(pd.DataFrame(datashuffle[start:stop]))

for i, piece in enumerate(dfs):
    piece.to_pickle(f"p2_{i}.pkl")

# Notebook-style preview of the piece sizes (no output when run as a script).
[len(piece) for piece in dfs]
# Seed the hive-partitioned parquet dataset in ./data3 with the first piece.
#
# The directory is cleared *before* writing.  The original notebook cell ran
# ``!rm -rf data3`` after the writes, which (a) is IPython-only syntax and a
# SyntaxError in a plain .py file, and (b) deleted the dataset that the
# plot()/append() calls further down read.
import shutil

# Same effect as ``rm -rf data3``: a missing directory is not an error.
shutil.rmtree("data3", ignore_errors=True)

fastparquet.write(
    filename="./data3",
    data=dfs[0],
    compression='GZIP',
    file_scheme='hive',
    #open_with=myopen,
    partition_on=['year', 'month', 'day'],
    write_index=False,
    #mkdirs= lambda x: True # for s3fs
)

# Same rows written through the pandas API.  The path is passed positionally:
# the old ``fname=`` keyword is not accepted by current pandas, while the
# first positional argument is the target path in old and new versions alike.
# NOTE(review): this targets the same directory as the write above without
# append, so one of the two writes looks redundant — confirm which to keep.
dfs[0].sort_values("ds").to_parquet(
    "./data3/",
    compression='GZIP',
    engine='fastparquet',
    partition_cols=['year', 'month', 'day'],
    index=False,
)
def plot():
    """Plot column ``A`` of the ``./data3`` parquet dataset against ``ds``.

    The dataset is read in full with the pyarrow engine and ordered by
    ``ds``.  A new 17x6 inch figure is created on every call; nothing is
    returned.
    """
    stored = pd.read_parquet("./data3", engine='pyarrow')
    stored = stored.sort_values("ds")

    # Fixed reference range built from the two date literals.
    reference_days = pd.date_range(start='1/1/2020', end='1/7/2020')

    figure, axes = plt.subplots()
    figure.set_size_inches(17, 6)

    # Fully transparent zero line over the reference range — presumably so
    # the x-axis always spans those dates no matter how much data is stored
    # (confirm).
    zeros = [0] * len(reference_days)
    axes.plot(reference_days.values, zeros, alpha=0)

    axes.plot(stored["ds"].values, stored["A"].values)
def append(part):
    """Append pickled piece ``part`` to ``./data3``, skipping rows already stored.

    Loads ``p2_<part>.pkl``, reads the ``ds`` values already in the dataset
    that fall within the piece's time span, drops the rows of the piece whose
    ``ds`` is among them, and appends the remainder with the fastparquet
    engine, partitioned by year/month/day.  The number of dropped rows is
    printed as ``dup <n>``.

    Args:
        part: Index of the piece; selects the file ``p2_<part>.pkl``.
    """
    toadd = pd.read_pickle(f"p2_{part}.pkl")
    if toadd.empty:
        # Nothing to append; also avoids computing bounds on an empty frame.
        print("dup", 0)
        return

    # Bounds come from min/max so they do not depend on the row order of the
    # pickle, and are compared inclusively: with strict '>' / '<' a stored row
    # equal to the first or last timestamp of the piece was never fetched, so
    # that boundary row was appended a second time.
    lower = pd.Timestamp(toadd["ds"].min().replace(tzinfo=None))
    upper = pd.Timestamp(toadd["ds"].max().replace(tzinfo=None))
    filters = [('ds', '>=', lower), ('ds', '<=', upper)]

    ddf = dd.read_parquet("./data3", columns=['ds'], filters=filters, index=False)
    dup = ddf['ds'].compute()

    # The exact membership test below decides what counts as a duplicate; the
    # filters above only limit how much is read.
    toadd_new = toadd[~toadd["ds"].isin(dup.tolist())]
    print("dup", len(toadd) - len(toadd_new))

    if toadd_new.empty:
        # Every row is already stored; skip the write.
        return

    # Path passed positionally: the old ``fname=`` keyword is not accepted by
    # current pandas, while the first positional argument is the target path
    # in old and new versions alike.
    toadd_new.to_parquet(
        "./data3/",
        compression='GZIP',
        engine='fastparquet',
        append=True,
        partition_cols=['year', 'month', 'day'],
        index=False,
    )
# Demo run: show the seeded dataset, then append pieces out of order.
plot()

# These pieces are each followed by a plot of the dataset so far.
for piece_index in (1, 9, 7, 5):
    append(piece_index)
    plot()

# The remaining pieces are appended back to back, with one final plot.
for piece_index in (2, 3, 4, 6, 8):
    append(piece_index)
plot()