In [25]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime
In [ ]:
 
In [6]:
html = requests.get('https://crashlaker.github.io/posts', verify=False).text
/opt/conda/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
In [7]:
soup = BeautifulSoup(html, 'html.parser')
In [17]:
row = []
for tr in soup.find('table').find_all('tr'):
    #print(tr)
    col = []
    for td in tr.find_all('td'):
        col.append(td.text.strip())
    row.append(col)
In [21]:
cols = row[0]
df = pd.DataFrame(row[1:])
df.columns = cols
In [22]:
df
Out[22]:
Date Title
0 15 May 2022 SRE Lead
1 13 May 2022 MinIO Python
2 11 May 2022 Remove Contains zfs_member file system wipefs
3 09 May 2022 FluentD Study
4 08 May 2022 MinIO Study
... ... ...
523 21 April 2019 Practicing around D3.js over Appdynamics
524 28 May 2018 Install MediaWiki 1.30 + VisualEditor 1.30 on ...
525 08 May 2018 Environment modules cache issues
526 03 May 2018 Spack custom module names
527 30 April 2018 Hello World

528 rows × 2 columns

In [30]:
df['Date'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%d %B %Y"))
In [31]:
df
Out[31]:
Date Title
0 2022-05-15 SRE Lead
1 2022-05-13 MinIO Python
2 2022-05-11 Remove Contains zfs_member file system wipefs
3 2022-05-09 FluentD Study
4 2022-05-08 MinIO Study
... ... ...
523 2019-04-21 Practicing around D3.js over Appdynamics
524 2018-05-28 Install MediaWiki 1.30 + VisualEditor 1.30 on ...
525 2018-05-08 Environment modules cache issues
526 2018-05-03 Spack custom module names
527 2018-04-30 Hello World

528 rows × 2 columns

In [34]:
dir(df['Date'].iloc[0])
Out[34]:
['__add__',
 '__array_priority__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rsub__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__weakref__',
 '_date_attributes',
 '_date_repr',
 '_get_date_name_field',
 '_get_start_end_field',
 '_has_time_component',
 '_repr_base',
 '_round',
 '_short_repr',
 '_time_repr',
 'asm8',
 'astimezone',
 'ceil',
 'combine',
 'ctime',
 'date',
 'day',
 'day_name',
 'dayofweek',
 'dayofyear',
 'days_in_month',
 'daysinmonth',
 'dst',
 'floor',
 'fold',
 'freq',
 'freqstr',
 'fromisoformat',
 'fromordinal',
 'fromtimestamp',
 'hour',
 'is_leap_year',
 'is_month_end',
 'is_month_start',
 'is_quarter_end',
 'is_quarter_start',
 'is_year_end',
 'is_year_start',
 'isocalendar',
 'isoformat',
 'isoweekday',
 'max',
 'microsecond',
 'min',
 'minute',
 'month',
 'month_name',
 'nanosecond',
 'normalize',
 'now',
 'quarter',
 'replace',
 'resolution',
 'round',
 'second',
 'strftime',
 'strptime',
 'time',
 'timestamp',
 'timetuple',
 'timetz',
 'to_datetime64',
 'to_julian_date',
 'to_numpy',
 'to_period',
 'to_pydatetime',
 'today',
 'toordinal',
 'tz',
 'tz_convert',
 'tz_localize',
 'tzinfo',
 'tzname',
 'utcfromtimestamp',
 'utcnow',
 'utcoffset',
 'utctimetuple',
 'value',
 'week',
 'weekday',
 'weekofyear',
 'year']
In [35]:
df['YearMon'] = df['Date'].apply(lambda x: datetime.datetime(x.year, x.month, 1))
In [47]:
import matplotlib.dates as mdates
In [61]:
df2.index[0].year
Out[61]:
2018
In [ ]:
 
In [66]:
df2 = df[['YearMon', 'Title']].rename(columns={'Title': 'Count'}).groupby('YearMon').count()
ax = df2.plot(kind='bar', figsize=(17,6))
#ax.set_title('asdf')
for label in ax.get_xmajorticklabels() + ax.get_xmajorticklabels():
    label.set_rotation(30)
    label.set_horizontalalignment("right")
# Set major ticks and tick labels
# https://stackoverflow.com/questions/33743394/matplotlib-dateformatter-for-axis-label-not-working
ax.set_xticks(range(df2.index.size))
ax.set_xticklabels([ts.strftime('%b\n%Y') if ts.year != df2.index[idx-1].year
                    else ts.strftime('%b') for idx, ts in enumerate(df2.index)])
ax.figure.autofmt_xdate(rotation=0, ha='center');
In [ ]:
 
In [44]:
df['Year'] = df['Date'].apply(lambda x: datetime.datetime(x.year, 1, 1))
In [45]:
ax = df[['Year', 'Title']].rename(columns={'Title': 'Count'}).groupby('Year').count().plot(kind='bar', figsize=(17,6))
In [ ]: