Setup Environment

Set ASIC Data URL and analysis libraries

import requests
import pytz
from     datetime   import date, datetime
import csv
import io
import os
import requests
import json
import random
import pandas as pd
import re
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
import shutil
import plotly.express as px

## Set download URL
url = 'https://data.gov.au/data/dataset/7b8656f9-606d-4337-af29-66b89b2eeefb/resource/cb7e4eb5-ed46-4c6c-97a0-4532f4479b7d/download/company_202104.zip'

Download ASIC Data File

And Unzip the downloaded file

## Setup function to stream file to disk
def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

downloaded_file = download_file(url)

import zipfile
with zipfile.ZipFile(downloaded_file,"r") as zip_ref:
    zip_ref.extractall("./")

downloaded_file = 'company_202112.zip'

Process ASIC Data into Dataframe

To allow further analysis, display the first 3 rows to get an understand of the data

data = []

with open(downloaded_file.upper().split(".")[0] + ".csv",encoding='ISO-8859-1') as csvfile:
    readCSV = csv.reader(csvfile, delimiter='\t')
    for index, row in enumerate(readCSV):
          if index < 1:
            continue
          dataRow = {}
          dataRow["companyName"]                                            = row[0].strip()
          dataRow['AustralianCompanyNumber']                                = row[1].strip()
          dataRow['type']                                                   = row[2].strip()
          dataRow['class']                                                  = row[3].strip()
          dataRow['subClass']                                               = row[4].strip()
          dataRow['status']                                                 = row[5].strip()
          dataRow['registrationDate']                                       = row[6].strip()
          dataRow['previousRegistrationState']                              = row[7].strip()
          dataRow['stateRegistrationNumber']                                = row[8].strip()
          dataRow['modifiedSinceLastReportFlag']                            = row[9].strip()
          dataRow['currentNameFlag']                                        = row[10].strip()
          dataRow['australianBusinessNumber']                               = row[11].strip()
          dataRow['currentName']                                            = row[12].strip()
          dataRow['currentNameStartDate']                                   = row[13].strip()
          data.append(dataRow)
#          if index > 100:
#              break
df = pd.DataFrame (data)

df['registrationDate'] = pd.to_datetime(df['registrationDate'])
df['companyNameClean'] = df['companyName'].map(lambda x: re.sub(  r"[^a-zA-Z0-9]+", ' ', x))

type_descriptions = {'APTY' : 'Australian proprietary company',
                     'APUB' : 'Australian public company',
                     'FNOS' : 'Foreign company (a company incorporated outside Australia but registered as a foreign company in Australia)',
                     'NONC' : 'Non-organisation (a body not registered under Corporations Act',
                     'RACN' : 'Registered Australian Corporation (an organisation incorporated in Australia under a law other than the Corporations Law but required to be registered under the Corporations Law).'
                          }

class_descriptions = {
    'LMSH'  : 'Limited by Shares',
    'LMGT'  : 'Limited by Guarantee',
    'LMSG'  : 'Limited by Both Shares and Guarantees',
    'NLIA'  : 'No Liability Company',
    'UNLM'  : 'Unlimited Company',
    'NONE'  : 'Does not have an equivalent Australian liability',
}                          

subClass_descriptions = {
'EXPT' : 'Exempt Proprietary Company',
'HUNT' : 'Proprietary home unit company',
'LISN' : 'Company licensed under Section 383 of the Corporations Act 2001',
'LISS' : 'Company licensed under Section 383 to omit Limitedfrom its name',
'LIST' : 'Listed public company',
'NEXT' : 'Non-Exempt Proprietary Company',
'NLTD' : 'Non-profit public company registered without limited in its name under Section 150',
'NONE' : 'Unknown',
'PNPC' : 'Proprietary non-profit company',
'PROP' : 'Proprietary other',
'PSTC' : 'Proprietary superannuation trustee company',
'PUBF' : 'Foreign company required to lodge a balance sheet',
'RACA' : 'Registrable Australian corporation – association',
'RACO' : 'Registrable Australian corporation - non association',
'STFI' : 'Public company – small transferring financial institution',
'ULSN' : 'Unlisted public - non-profit company',
'ULSS' : 'Unlisted public - superannuation trustee company',
'ULST' : 'Unlisted pubic company'
}

status_descriptions = {'DRGD' : 'De-registered',
                       'EXAD' : 'External administration (in receivership/liquidation)',
                       'NOAC' : 'Not active',
                       'NRGD' : 'Not registered',
                       'PROV' : 'Provisional',
                       'REGD' : 'Registered',
                       'SOFF' : 'Strike-off action in progress',
                       'DISS' : 'Dissolved by Special Act of Parliament',
                       'DIV3' : 'Organisation Transferred Registration via DIV3',
                       'PEND' : 'Pending - Schemes'
}

df = df.replace({"type": type_descriptions, "class" : class_descriptions, 'subClass': subClass_descriptions, 'status': status_descriptions}) 

df.head(3)

companyName AustralianCompanyNumber type class subClass status registrationDate previousRegistrationState stateRegistrationNumber modifiedSinceLastReportFlag currentNameFlag australianBusinessNumber currentName currentNameStartDate companyNameClean
0 TGR BIOSCIENCES PTY LTD 097258789 Australian proprietary company Limited by Shares Proprietary other Registered 2001-06-25 T1377600 Y 39097258789 TGR BIOSCIENCES PTY LTD
1 DELVADO AGENCIES PTY. LTD. 050461268 Australian proprietary company Limited by Shares Proprietary other Registered 1991-02-14 ACT R 90004182 31050461268 ILLUMINATED SOLUTIONS PTY LTD 15/07/1999 DELVADO AGENCIES PTY LTD
2 ILLUMINATED SOLUTIONS PTY LTD 050461268 Australian proprietary company Limited by Shares Proprietary other Registered 1991-02-14 ACT R 90004182 Y 31050461268 ILLUMINATED SOLUTIONS PTY LTD

Remove Historical Name Records

Basically just remove everything where the currentNameFlag does not equal 'Y'

original_df_row_count = len(df)

df = df[df['currentNameFlag'] == 'Y']

new_df_row_count = len(df)

print('Dataframe record count reduced from {} to {}, a reduction of {} records'.format(original_df_row_count, new_df_row_count, original_df_row_count - new_df_row_count))

Dataframe record count reduced from 3542273 to 3129507, a reduction of 412766 records

Lets Review the Rate of Company Creation over time

Broken down by status. Aligned to expections more companies are being registered over time. Of note is the very small number of companies being de-registered in 2020 (click the legend to drill isolate the chart to just de-registered).

df_recent= df.loc [
                    (df["registrationDate"] > '1980-01-01' )
                    ,
                    ['registrationDate', 'status']
                  ]
fig = px.histogram(df_recent, x=df_recent["registrationDate"].dt.year, color=df_recent['status'], labels={'x' :'Registration Year'}, title='Businesses Registered per Year')
fig.for_each_trace(lambda t: t.update(name=t.name.replace("status=", "")))
fig.show()

Lets take a closer look at the last couple of years

Future dated registrations have been removed from the graph. Its interesting to see that there is always a spike in registration just before the end of the financial year. Key point from both these histograms is that business registrations remained high during COVID however the part of de-registrations has significnatly changed.

df_recent= df.loc [
                    (df["registrationDate"] > '2017-01-01' ) &   (df["registrationDate"] < '2022-01-01' )
                    ,
                    ['registrationDate', 'status']
                  ]
fig = px.histogram(df_recent, x=df_recent["registrationDate"], color=df_recent['status'], title='Count of Businesses Registered per day')
fig.for_each_trace(lambda t: t.update(name=t.name.replace("status=", "")))
fig.show()

Create Simple World Cloud of Company Name Terms

Most common company names - excluding short and high frequency terms (e.g. pty, ltd, limited)

top_companyName_terms = pd.Series(' '.join(df['companyNameClean']).upper().split()).value_counts()[:700].reset_index()
top_companyName_terms.columns = ['term', 'frequencyCount']
top_companyName_terms =top_companyName_terms[(~top_companyName_terms['term'].isin(['LIMITED', 'LTD', 'PTY', 'QLD', 'VIC', 'HOLDINGS', 'AND', 'GROUP', 'THE', 'AUSTRALIA', 'NOMINEES', 'SERVICES'])) &
                                             (top_companyName_terms['term'].str.len() > 2 )
                                             ]    ## filter terms 

#[STOPWORDS.add(n) for n in ['Qld', 'Nsw', 'Vic', 'The', 'Aust', 'Holdings']]

wc = WordCloud(width = 800, 
                      height = 600, 
                      max_words = 10000,
                      random_state=1, 
                      background_color='salmon', 
                      colormap='Pastel1', 
                      collocations=False, 
                      stopwords = STOPWORDS).generate_from_frequencies(
                                               dict(zip(
                                                 top_companyName_terms['term'].str.title().tolist(), 
                                                 top_companyName_terms['frequencyCount'].tolist()
                                                 )
                                               )
                                             )


plt.figure(figsize=(30, 40))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

T