Setup Environment

Set ASIC Data URL and analysis libraries

import requests
import pytz
from     datetime   import date, datetime
import csv
import io
import os
import requests
import json
import random
import pandas as pd
import re
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
import shutil
import plotly.express as px

## Set download URL
url = 'https://data.gov.au/data/dataset/7b8656f9-606d-4337-af29-66b89b2eeefb/resource/cb7e4eb5-ed46-4c6c-97a0-4532f4479b7d/download/company_202104.zip'

Download ASIC Data File

And Unzip the downloaded file

## Setup function to stream file to disk
def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

downloaded_file = download_file(url)

import zipfile
with zipfile.ZipFile(downloaded_file,"r") as zip_ref:
    zip_ref.extractall("./")

downloaded_file = 'company_202112.zip'

Process ASIC Data into Dataframe

To allow further analysis, display the first 3 rows to get an understand of the data

data = []

with open(downloaded_file.upper().split(".")[0] + ".csv",encoding='ISO-8859-1') as csvfile:
    readCSV = csv.reader(csvfile, delimiter='\t')
    for index, row in enumerate(readCSV):
          if index < 1:
            continue
          dataRow = {}
          dataRow["companyName"]                                            = row[0].strip()
          dataRow['AustralianCompanyNumber']                                = row[1].strip()
          dataRow['type']                                                   = row[2].strip()
          dataRow['class']                                                  = row[3].strip()
          dataRow['subClass']                                               = row[4].strip()
          dataRow['status']                                                 = row[5].strip()
          dataRow['registrationDate']                                       = row[6].strip()
          dataRow['previousRegistrationState']                              = row[7].strip()
          dataRow['stateRegistrationNumber']                                = row[8].strip()
          dataRow['modifiedSinceLastReportFlag']                            = row[9].strip()
          dataRow['currentNameFlag']                                        = row[10].strip()
          dataRow['australianBusinessNumber']                               = row[11].strip()
          dataRow['currentName']                                            = row[12].strip()
          dataRow['currentNameStartDate']                                   = row[13].strip()
          data.append(dataRow)
#          if index > 100:
#              break
df = pd.DataFrame (data)

df['registrationDate'] = pd.to_datetime(df['registrationDate'])
df['companyNameClean'] = df['companyName'].map(lambda x: re.sub(  r"[^a-zA-Z0-9]+", ' ', x))

type_descriptions = {'APTY' : 'Australian proprietary company',
                     'APUB' : 'Australian public company',
                     'FNOS' : 'Foreign company (a company incorporated outside Australia but registered as a foreign company in Australia)',
                     'NONC' : 'Non-organisation (a body not registered under Corporations Act',
                     'RACN' : 'Registered Australian Corporation (an organisation incorporated in Australia under a law other than the Corporations Law but required to be registered under the Corporations Law).'
                          }

class_descriptions = {
    'LMSH'  : 'Limited by Shares',
    'LMGT'  : 'Limited by Guarantee',
    'LMSG'  : 'Limited by Both Shares and Guarantees',
    'NLIA'  : 'No Liability Company',
    'UNLM'  : 'Unlimited Company',
    'NONE'  : 'Does not have an equivalent Australian liability',
}                          

subClass_descriptions = {
'EXPT' : 'Exempt Proprietary Company',
'HUNT' : 'Proprietary home unit company',
'LISN' : 'Company licensed under Section 383 of the Corporations Act 2001',
'LISS' : 'Company licensed under Section 383 to omit Limitedfrom its name',
'LIST' : 'Listed public company',
'NEXT' : 'Non-Exempt Proprietary Company',
'NLTD' : 'Non-profit public company registered without limited in its name under Section 150',
'NONE' : 'Unknown',
'PNPC' : 'Proprietary non-profit company',
'PROP' : 'Proprietary other',
'PSTC' : 'Proprietary superannuation trustee company',
'PUBF' : 'Foreign company required to lodge a balance sheet',
'RACA' : 'Registrable Australian corporation – association',
'RACO' : 'Registrable Australian corporation - non association',
'STFI' : 'Public company – small transferring financial institution',
'ULSN' : 'Unlisted public - non-profit company',
'ULSS' : 'Unlisted public - superannuation trustee company',
'ULST' : 'Unlisted pubic company'
}

status_descriptions = {'DRGD' : 'De-registered',
                       'EXAD' : 'External administration (in receivership/liquidation)',
                       'NOAC' : 'Not active',
                       'NRGD' : 'Not registered',
                       'PROV' : 'Provisional',
                       'REGD' : 'Registered',
                       'SOFF' : 'Strike-off action in progress',
                       'DISS' : 'Dissolved by Special Act of Parliament',
                       'DIV3' : 'Organisation Transferred Registration via DIV3',
                       'PEND' : 'Pending - Schemes'
}

df = df.replace({"type": type_descriptions, "class" : class_descriptions, 'subClass': subClass_descriptions, 'status': status_descriptions}) 

df.head(3)

Remove Historical Name Records

Basically just remove everything where the currentNameFlag does not equal 'Y'

original_df_row_count = len(df)

df = df[df['currentNameFlag'] == 'Y']

new_df_row_count = len(df)

print('Dataframe record count reduced from {} to {}, a reduction of {} records'.format(original_df_row_count, new_df_row_count, original_df_row_count - new_df_row_count))

Dataframe record count reduced from 3542273 to 3129507, a reduction of 412766 records

Lets Review the Rate of Company Creation over time

Broken down by status. Aligned to expections more companies are being registered over time. Of note is the very small number of companies being de-registered in 2020 (click the legend to drill isolate the chart to just de-registered).

df_recent= df.loc [
                    (df["registrationDate"] > '1980-01-01' )
                    ,
                    ['registrationDate', 'status']
                  ]
fig = px.histogram(df_recent, x=df_recent["registrationDate"].dt.year, color=df_recent['status'], labels={'x' :'Registration Year'}, title='Businesses Registered per Year')
fig.for_each_trace(lambda t: t.update(name=t.name.replace("status=", "")))
fig.show()

Lets take a closer look at the last couple of years

Future dated registrations have been removed from the graph. Its interesting to see that there is always a spike in registration just before the end of the financial year. Key point from both these histograms is that business registrations remained high during COVID however the part of de-registrations has significnatly changed.

df_recent= df.loc [
                    (df["registrationDate"] > '2017-01-01' ) &   (df["registrationDate"] < '2022-01-01' )
                    ,
                    ['registrationDate', 'status']
                  ]
fig = px.histogram(df_recent, x=df_recent["registrationDate"], color=df_recent['status'], title='Count of Businesses Registered per day')
fig.for_each_trace(lambda t: t.update(name=t.name.replace("status=", "")))
fig.show()

Create Simple World Cloud of Company Name Terms

Most common company names - excluding short and high frequency terms (e.g. pty, ltd, limited)

top_companyName_terms = pd.Series(' '.join(df['companyNameClean']).upper().split()).value_counts()[:700].reset_index()
top_companyName_terms.columns = ['term', 'frequencyCount']
top_companyName_terms =top_companyName_terms[(~top_companyName_terms['term'].isin(['LIMITED', 'LTD', 'PTY', 'QLD', 'VIC', 'HOLDINGS', 'AND', 'GROUP', 'THE', 'AUSTRALIA', 'NOMINEES', 'SERVICES'])) &
                                             (top_companyName_terms['term'].str.len() > 2 )
                                             ]    ## filter terms 

#[STOPWORDS.add(n) for n in ['Qld', 'Nsw', 'Vic', 'The', 'Aust', 'Holdings']]

wc = WordCloud(width = 800, 
                      height = 600, 
                      max_words = 10000,
                      random_state=1, 
                      background_color='salmon', 
                      colormap='Pastel1', 
                      collocations=False, 
                      stopwords = STOPWORDS).generate_from_frequencies(
                                               dict(zip(
                                                 top_companyName_terms['term'].str.title().tolist(), 
                                                 top_companyName_terms['frequencyCount'].tolist()
                                                 )
                                               )
                                             )


plt.figure(figsize=(30, 40))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

T

	companyName	AustralianCompanyNumber	type	class	subClass	status	registrationDate	previousRegistrationState	stateRegistrationNumber	currentNameFlag	australianBusinessNumber	currentName	currentNameStartDate	companyNameClean
0	TGR BIOSCIENCES PTY LTD	097258789	Australian proprietary company	Limited by Shares	Proprietary other	Registered	2001-06-25		T1377600	Y	39097258789			TGR BIOSCIENCES PTY LTD
1	DELVADO AGENCIES PTY. LTD.	050461268	Australian proprietary company	Limited by Shares	Proprietary other	Registered	1991-02-14	ACT	R 90004182		31050461268	ILLUMINATED SOLUTIONS PTY LTD	15/07/1999	DELVADO AGENCIES PTY LTD
2	ILLUMINATED SOLUTIONS PTY LTD	050461268	Australian proprietary company	Limited by Shares	Proprietary other	Registered	1991-02-14	ACT	R 90004182	Y	31050461268			ILLUMINATED SOLUTIONS PTY LTD