(Linear Regression) Predicting Melbourne Weather via Historical Data

David(Zhen Zhang)

2020-05-30

This fun project is only for study and experimentation, inspired by a similar prediction project (in production) that I was involved in.

It’s a time-series-based prediction, so I won’t discuss the accuracy (the accuracy of linear regression for time series prediction is very good, except when the data is random or there are too few samples) or EDA.

This application will run daily on Google Cloud.

Data source

Australian Government - Bureau of Meteorology

There are various kinds of weather data.

We need to download the csv file.

However, we need as much historical data as possible; more (recent) samples would improve the accuracy.
Luckily, the Australian Bureau of Meteorology provides the last 14 months of data.

Prepare Google Cloud Storage

# Point the Google client libraries at the credentials JSON file and project.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=os.path.abspath("./prediction-melb-8c16f470aeda.json")
os.environ["GCLOUD_PROJECT"]="prediction-melb"

# Cloud Storage client shared by the bucket helper functions.
storage_client = storage.Client()

# NOTE(review): `buckets` is not referenced anywhere else in the visible code.
buckets = list(storage_client.list_buckets())

# Handle to the "bomdata" bucket used by uploadFileToBukect.
bucket = storage_client.get_bucket("bomdata")

def isFileExistInBukect(file):
    """Return whether an object named ``file`` exists in the "bomdata" bucket."""
    target_bucket = storage_client.bucket("bomdata")
    candidate = storage.Blob(bucket=target_bucket, name=file)
    return candidate.exists(storage_client)

def uploadFileToBukect(filename):
    """Upload the local file ``filename`` to the bucket under the same object name."""
    local_path = os.path.abspath(filename)
    bucket.blob(filename).upload_from_filename(local_path)

Download data

def downloadMelbWeatherData():
    """Download BOM Melbourne daily-observation CSVs and upload them to the bucket.

    The file for the month that contains yesterday is always downloaded again
    and re-uploaded. The files for the current month and the 13 months before
    it are downloaded only when they are missing from the bucket; each of
    those local copies is deleted after the upload.
    """
    #from http://www.bom.gov.au/climate/dwo/202005/html/IDCJDW3050.202005.shtml
    now = arrow.now()
    n = now.shift(days=-1).format("YYYYMM")
    print("downloading yesterday dataset "+n+".csv")
    url = "http://www.bom.gov.au/climate/dwo/"+n+"/text/IDCJDW3050."+n+".csv"
    os.system("wget -O {0} {1}".format(n+".csv", url))
    uploadFileToBukect(n+'.csv')

    for i in range(14):
        m = now.shift(months=-i).format("YYYYMM")
        if isFileExistInBukect(m+".csv"):
            continue

        url = "http://www.bom.gov.au/climate/dwo/"+m+"/text/IDCJDW3050."+m+".csv"
        print("downloading "+url)
        os.system("wget -O {0} {1}".format(m+".csv", url))
        uploadFileToBukect(m+'.csv')
        # The previous "rm {0} {1}".format(name) had two replacement fields but
        # only one argument, so it raised IndexError before rm ever ran.
        os.remove(m+".csv")

Combine all csv files to one

def combineDataset():
    """Merge every object in the "bomdata" bucket into a local ``all.csv``.

    A fixed header row is written first. For each object, the first 9 lines
    are skipped and every remaining line is appended, followed by one blank
    line per object.
    """
    # Close the key file deterministically instead of leaking the handle.
    with open('prediction-melb-8c16f470aeda.json') as keyFile:
        token = json.load(keyFile)
    fs = gcsfs.GCSFileSystem(project='prediction-melb',token=token)
    boms = fs.ls('bomdata')
    # utf-8 matches the default encoding pd.read_csv uses to read the file back.
    with open("all.csv", "w", encoding="utf-8") as f2:
        f2.write('"Non","Date","MiniTemp","MaxTemp","Rainfall","Evaporation (mm)","Sunshine","Direction of maximum wind gust ","Speed of maximum wind gust (km/h)","Time of maximum wind gust","9am Temperature (ḞC)","9am relative humidity (%)","9am cloud amount (oktas)","9am wind direction","9am wind speed (km/h)","9am MSL pressure (hPa)","3pm Temperature (ḞC)","3pm relative humidity (%)","3pm cloud amount (oktas)","3pm wind direction","3pm wind speed (km/h)","3pm MSL pressure (hPa)"\n')
        for bom in boms:
            with fs.open(bom) as f:
                lines = f.readlines()
            # Skip the first 9 lines of each file (presumably the BOM preamble
            # and its own column header - confirm against a downloaded file).
            for line in lines[9:]:
                # Decode the bytes instead of editing repr(bytes): the old
                # str(line).replace("b'", "") left a stray quote on a line
                # without CRLF and removed any "b'" inside the data.
                # latin-1 maps every byte, so decoding cannot fail.
                f2.write(line.decode("latin-1").rstrip("\r\n") + "\n")
            f2.write("\n")

Convert data

Since we need to build a time series, we use the data from day 1 to day N to predict day M.

M = N + n, n as you wish. I set M = N + 1.

def convertTTData(array,inputDayNum):
    """Turn a 1-D series into sliding-window training data.

    Each sample ``x[i]`` holds ``inputDayNum`` consecutive values and ``y[i]``
    is the value that immediately follows that window.

    Args:
        array: sequence of observations (list, NumPy array or pandas Series).
        inputDayNum: number of consecutive values in each input window.

    Returns:
        ``(x, y)`` where ``x`` has shape ``(samples, inputDayNum)`` and ``y``
        has shape ``(samples,)``. Both are empty when ``array`` has no more
        than ``inputDayNum`` values.
    """
    # Positional indexing regardless of the container (e.g. a Series index).
    values = np.asarray(array)
    # The last valid window starts at len - inputDayNum - 1, so there are
    # len - inputDayNum samples; the previous bound dropped the newest one.
    sampleCount = max(len(values) - inputDayNum, 0)
    x = [values[i:i + inputDayNum] for i in range(sampleCount)]
    y = [values[i + inputDayNum] for i in range(sampleCount)]
    return np.array(x).reshape(-1,inputDayNum),np.array(y)

Prepare latest data

Prepare latest data for prediction

1
2
3

# Latest weather data used as the prediction input
def getLatestX(array,inputDayNum):
    """Return ``array[len(array)-inputDayNum:]`` as a NumPy array.

    When ``array`` holds at least ``inputDayNum`` values this is its last
    ``inputDayNum`` values.
    """
    start = len(array) - inputDayNum
    tail = array[start:]
    return np.array(tail)

Train and prediction

def predict(array,inputDayNum):
    """Fit a linear regression on sliding windows of ``array`` and predict the next value.

    Args:
        array: sequence of past observations.
        inputDayNum: number of consecutive values used as model input.

    Returns:
        The single value predicted from the last ``inputDayNum`` observations.
    """
    # The parameter used to be misspelled "inputinputDayNumNum", so every
    # reference to inputDayNum below raised NameError.
    x,y = convertTTData(array,inputDayNum)
    px = getLatestX(array,inputDayNum)
    regressor = LinearRegression()
    regressor.fit(x, y)  #train first
    py = regressor.predict([px]) # predict
    return py[0]

Save Data

Save the prediction result to Google Cloud Firestore

Predict the rainfall, max/min temperature, and sunshine hours

# Firestore client used by predictAndPushData to store the predicted values.
db = firestore.Client()

def predictAndPushData():
    """Rebuild ``all.csv``, predict four weather fields and store them in Firestore.

    The predictions are written to the ``predicted`` collection in a document
    named after the current date (``YYYYMMDD``).
    """
    combineDataset()
    df = pd.read_csv("all.csv")
    del df["Non"]
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()

    now = arrow.now()
    # Window length; previously assigned but bypassed by hard-coded 7s.
    inputDayNum = 7

    doc_ref = db.collection(u'predicted').document(now.format("YYYYMMDD"))
    doc_ref.set({
        u'Rainfall': predict(df["Rainfall"],inputDayNum),
        u'MiniTemp': predict(df["MiniTemp"],inputDayNum),
        u'MaxTemp': predict(df["MaxTemp"],inputDayNum),
        u'Sunshine': predict(df["Sunshine"],inputDayNum),
    })

A Snapshot of the Prediction (Notebook)

Source Code

import pandas as pd
import json
import arrow
from google.cloud import firestore
from google.cloud import storage
import os
import requests
import gcsfs
import json
import numpy as np
from sklearn.linear_model import LinearRegression
import schedule
import time

# Point the Google client libraries at the credentials JSON file and project.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=os.path.abspath("./prediction-melb-8c16f470aeda.json")
os.environ["GCLOUD_PROJECT"]="prediction-melb"

# Shared clients: Cloud Storage for the CSV files, Firestore for the predictions.
storage_client = storage.Client()
db = firestore.Client()

# NOTE(review): `buckets` is not referenced anywhere else in the visible code.
buckets = list(storage_client.list_buckets())
# Handle to the "bomdata" bucket used by uploadFileToBukect.
bucket = storage_client.get_bucket("bomdata")

def isFileExistInBukect(file):
    """Return whether an object named ``file`` exists in the "bomdata" bucket."""
    target_bucket = storage_client.bucket("bomdata")
    candidate = storage.Blob(bucket=target_bucket, name=file)
    return candidate.exists(storage_client)

def uploadFileToBukect(filename):
    """Upload the local file ``filename`` to the bucket under the same object name."""
    local_path = os.path.abspath(filename)
    bucket.blob(filename).upload_from_filename(local_path)
    
def downloadMelbWeatherData():
    """Download BOM Melbourne daily-observation CSVs and upload them to the bucket.

    The file for the month that contains yesterday is always downloaded again
    and re-uploaded. The files for the current month and the 13 months before
    it are downloaded only when they are missing from the bucket; each of
    those local copies is deleted after the upload.
    """
    #from http://www.bom.gov.au/climate/dwo/202005/html/IDCJDW3050.202005.shtml
    now = arrow.now()
    # Use yesterday's month, as the log message says: on the 1st of a month
    # this refreshes the previous month's file so its last day is not lost.
    n = now.shift(days=-1).format("YYYYMM")
    print("downloading yesterday dataset "+n+".csv")
    url = "http://www.bom.gov.au/climate/dwo/"+n+"/text/IDCJDW3050."+n+".csv"
    os.system("wget -O {0} {1}".format(n+".csv", url))
    uploadFileToBukect(n+'.csv')

    for i in range(14):
        m = now.shift(months=-i).format("YYYYMM")
        if isFileExistInBukect(m+".csv"):
            continue

        url = "http://www.bom.gov.au/climate/dwo/"+m+"/text/IDCJDW3050."+m+".csv"
        print("downloading "+url)
        os.system("wget -O {0} {1}".format(m+".csv", url))
        uploadFileToBukect(m+'.csv')
        # The previous "rm {0} {1}".format(name) had two replacement fields but
        # only one argument, so it raised IndexError before rm ever ran.
        os.remove(m+".csv")

            
def combineDataset():
    """Merge every object in the "bomdata" bucket into a local ``all.csv``.

    A fixed header row is written first. For each object, the first 9 lines
    are skipped and every remaining line is appended, followed by one blank
    line per object.
    """
    # Close the key file deterministically instead of leaking the handle.
    with open('prediction-melb-8c16f470aeda.json') as keyFile:
        token = json.load(keyFile)
    fs = gcsfs.GCSFileSystem(project='prediction-melb',token=token)
    boms = fs.ls('bomdata')
    # utf-8 matches the default encoding pd.read_csv uses to read the file back.
    with open("all.csv", "w", encoding="utf-8") as f2:
        f2.write('"Non","Date","MiniTemp","MaxTemp","Rainfall","Evaporation (mm)","Sunshine","Direction of maximum wind gust ","Speed of maximum wind gust (km/h)","Time of maximum wind gust","9am Temperature (ḞC)","9am relative humidity (%)","9am cloud amount (oktas)","9am wind direction","9am wind speed (km/h)","9am MSL pressure (hPa)","3pm Temperature (ḞC)","3pm relative humidity (%)","3pm cloud amount (oktas)","3pm wind direction","3pm wind speed (km/h)","3pm MSL pressure (hPa)"\n')
        for bom in boms:
            with fs.open(bom) as f:
                lines = f.readlines()
            # Skip the first 9 lines of each file (presumably the BOM preamble
            # and its own column header - confirm against a downloaded file).
            for line in lines[9:]:
                # Decode the bytes instead of editing repr(bytes): the old
                # str(line).replace("b'", "") left a stray quote on a line
                # without CRLF and removed any "b'" inside the data.
                # latin-1 maps every byte, so decoding cannot fail.
                f2.write(line.decode("latin-1").rstrip("\r\n") + "\n")
            f2.write("\n")
            
def convertTTData(array,inputDayNum):
    """Turn a 1-D series into sliding-window training data.

    Each sample ``x[i]`` holds ``inputDayNum`` consecutive values and ``y[i]``
    is the value that immediately follows that window.

    Args:
        array: sequence of observations (list, NumPy array or pandas Series).
        inputDayNum: number of consecutive values in each input window.

    Returns:
        ``(x, y)`` where ``x`` has shape ``(samples, inputDayNum)`` and ``y``
        has shape ``(samples,)``. Both are empty when ``array`` has no more
        than ``inputDayNum`` values.
    """
    # Positional indexing regardless of the container (e.g. a Series index).
    values = np.asarray(array)
    # The last valid window starts at len - inputDayNum - 1, so there are
    # len - inputDayNum samples; the previous bound dropped the newest one.
    sampleCount = max(len(values) - inputDayNum, 0)
    x = [values[i:i + inputDayNum] for i in range(sampleCount)]
    y = [values[i + inputDayNum] for i in range(sampleCount)]
    return np.array(x).reshape(-1,inputDayNum),np.array(y)

def getLatetX(array,inputDayNum):
    """Return ``array[len(array)-inputDayNum:]`` as a NumPy array.

    When ``array`` holds at least ``inputDayNum`` values this is its last
    ``inputDayNum`` values.
    """
    start = len(array) - inputDayNum
    tail = array[start:]
    return np.array(tail)

def predict(array,inputDayNum):
    """Fit a linear regression on sliding windows of ``array`` and predict the next value.

    Args:
        array: sequence of past observations.
        inputDayNum: number of consecutive values used as model input.

    Returns:
        The first (and only) element of the model's prediction for the
        window returned by ``getLatetX``.
    """
    features, targets = convertTTData(array,inputDayNum)
    model = LinearRegression()
    model.fit(features, targets)
    latest_window = getLatetX(array,inputDayNum)
    forecast = model.predict([latest_window])
    return forecast[0]

def predictAndPushData():
    """Refresh the data, predict four weather fields and store them in Firestore.

    Downloads the latest CSVs, rebuilds ``all.csv``, forward-fills missing
    values, and writes the predictions to the ``predicted`` collection in a
    document named after the current date (``YYYYMMDD``).
    """
    downloadMelbWeatherData()
    combineDataset()

    df = pd.read_csv("all.csv")
    del df["Non"]
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()

    now = arrow.now()
    inputDayNum = 15

    doc_ref = db.collection(u'predicted').document(now.format("YYYYMMDD"))
    doc_ref.set({
        u'Rainfall': predict(df["Rainfall"],inputDayNum),
        u'MiniTemp': predict(df["MiniTemp"],inputDayNum),
        u'MaxTemp': predict(df["MaxTemp"],inputDayNum),
        u'Sunshine': predict(df["Sunshine"],inputDayNum),
    })

# Register the pipeline to run once a day at 00:30.
schedule.every().day.at("00:30").do(predictAndPushData)

# Wake up every five minutes and run whatever job is due.
while True:
    schedule.run_pending()
    time.sleep(5 * 60)

THE GOOGLE CLOUD PROJECT HAS BEEN REMOVED ON 4/6/2020