(Linear Regression) Predicting Melbourne Weather via Historical Data

This fun project is only for study and experimentation; it was inspired by a similar prediction project (in production) that I was engaged in.

It is a time-series-based prediction, so I won't discuss accuracy or EDA here (linear regression is usually very accurate for time series prediction, unless the data is random or the sample is small).

This application will run daily on Google Cloud.

Data source

Australian Government - Bureau of Meteorology

There are various kinds of weather data.

We need to download the csv file.

However, we need as much historical data as possible; more (recent) samples would improve the accuracy.
Luckily, the Australian Bureau of Meteorology provides the data for the last 14 months.

Prepare Google Cloud Storage

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Export the service-account key path and the project id through the
# environment before any Google Cloud client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=os.path.abspath("./prediction-melb-8c16f470aeda.json")
os.environ["GCLOUD_PROJECT"]="prediction-melb"

# Cloud Storage client shared by the helpers below.
storage_client = storage.Client()

# Materialized list of the buckets returned by the client.
buckets = list(storage_client.list_buckets())

# Bucket that the upload helper writes into.
bucket = storage_client.get_bucket("bomdata")

def isFileExistInBukect(file):
    """Return whether an object named *file* exists in the "bomdata" bucket."""
    target_bucket = storage_client.bucket("bomdata")
    candidate = storage.Blob(name=file, bucket=target_bucket)
    return candidate.exists(storage_client)

def uploadFileToBukect(filename):
    """Upload the local file *filename* to the bucket under the same object name."""
    local_path = os.path.abspath(filename)
    bucket.blob(filename).upload_from_filename(local_path)

Download data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def downloadMelbWeatherData():
    """Download the monthly Melbourne observation CSVs and upload them to the bucket.

    The CSV for the month that contains yesterday is always downloaded and
    uploaded again. After that, the current month and each of the 13 months
    before it are downloaded and uploaded only when the bucket does not
    already contain a file for that month.

    A file is uploaded only when ``wget`` exits with status 0, so a failed
    download never replaces or creates a bucket object.
    """
    # Human-readable page for one month, for reference:
    # http://www.bom.gov.au/climate/dwo/202005/html/IDCJDW3050.202005.shtml
    now = arrow.now()

    n = now.shift(days=-1).format("YYYYMM")
    print("downloading yesterday dataset "+n+".csv")
    url = "http://www.bom.gov.au/climate/dwo/"+n+"/text/IDCJDW3050."+n+".csv"
    if os.system("wget -O {0} {1}".format(n+".csv", url)) == 0:
        uploadFileToBukect(n+'.csv')
    else:
        print("download failed, not uploading "+n+".csv")

    for i in range(14):
        m = now.shift(months=-i).format("YYYYMM")
        filename = m+".csv"
        if isFileExistInBukect(filename):
            continue

        url = "http://www.bom.gov.au/climate/dwo/"+m+"/text/IDCJDW3050."+m+".csv"
        print("downloading "+url)
        if os.system("wget -O {0} {1}".format(filename, url)) == 0:
            uploadFileToBukect(filename)
        else:
            print("download failed, not uploading "+filename)

        # Drop the local copy; the existence check guards against wget not
        # having created the output file at all.
        if os.path.exists(filename):
            os.remove(filename)

Combine all CSV files into one

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def combineDataset():
    """Concatenate every object in the "bomdata" bucket into a local ``all.csv``.

    A fixed header row is written first. For each object listed in the bucket
    (in sorted name order), the first 9 lines are skipped and every remaining
    non-empty line is appended, one record per line.
    """
    # Close the key file as soon as it has been parsed.
    with open('prediction-melb-8c16f470aeda.json') as key_file:
        token = json.load(key_file)
    fs = gcsfs.GCSFileSystem(project='prediction-melb', token=token)

    skipped_lines = 9
    header = '"Non","Date","MiniTemp","MaxTemp","Rainfall","Evaporation (mm)","Sunshine","Direction of maximum wind gust ","Speed of maximum wind gust (km/h)","Time of maximum wind gust","9am Temperature (°C)","9am relative humidity (%)","9am cloud amount (oktas)","9am wind direction","9am wind speed (km/h)","9am MSL pressure (hPa)","3pm Temperature (°C)","3pm relative humidity (%)","3pm cloud amount (oktas)","3pm wind direction","3pm wind speed (km/h)","3pm MSL pressure (hPa)"\n'

    with open("all.csv", "w", encoding="utf-8") as f2:
        f2.write(header)
        # Sorting keeps the YYYYMM-named files in chronological order.
        for bom in sorted(fs.ls('bomdata')):
            with fs.open(bom) as f:
                lines = f.readlines()
            for raw in lines[skipped_lines:]:
                # The file is opened in binary mode; latin-1 maps every byte
                # to a character, so decoding cannot fail.
                text = raw.decode("latin-1").rstrip("\r\n")
                if text:
                    f2.write(text + "\n")

Convert data

We need to frame the data as a time series: the values from day 1 to day N are used to predict day M.

M = N + n, n as you wish. I set M = N + 1.

1
2
3
4
5
6
7
def convertTTData(array, inputDayNum):
    """Turn a 1-D series into sliding-window training data.

    Each sample is a window of ``inputDayNum`` consecutive values and its
    target is the value immediately after that window. Every position whose
    target still lies inside the series is used, including the last one.

    Args:
        array: One-dimensional sequence of values (list, NumPy array or
            pandas Series).
        inputDayNum: Number of consecutive values in each window.

    Returns:
        A tuple ``(x, y)``: ``x`` has shape ``(samples, inputDayNum)`` and
        ``y`` has shape ``(samples,)``. Both are empty when the series has no
        value after the first full window.
    """
    # Work on a plain positional array so the result does not depend on the
    # index labels of a pandas Series.
    values = np.asarray(array)
    sample_count = max(len(values) - inputDayNum, 0)
    x = [values[i:i + inputDayNum] for i in range(sample_count)]
    y = [values[i + inputDayNum] for i in range(sample_count)]
    return np.array(x).reshape(-1, inputDayNum), np.array(y)

Prepare latest data

Prepare latest data for prediction

1
2
3
# Latest weather data for prediction.
def getLatestX(array, inputDayNum):
    """Return the last ``inputDayNum`` values of *array* as a NumPy array.

    When the series holds fewer than ``inputDayNum`` values, all of them are
    returned.
    """
    # Clamp at 0 so a short series does not produce a negative start index
    # that wraps around and drops the oldest values.
    start = max(len(array) - inputDayNum, 0)
    return np.array(array[start:])

Train and predict

1
2
3
4
5
6
7
8
def predict(array, inputDayNum):
    """Fit a linear regression on sliding windows of *array* and predict one value.

    Args:
        array: One-dimensional series of past values.
        inputDayNum: Number of consecutive values used as model input.

    Returns:
        The model's prediction for the window made of the last
        ``inputDayNum`` values of *array*.
    """
    x, y = convertTTData(array, inputDayNum)
    px = getLatestX(array, inputDayNum)
    regressor = LinearRegression()
    regressor.fit(x, y)  # train first
    py = regressor.predict([px])  # predict
    return py[0]

Save Data

Save the prediction result to Google Cloud Firestore

Predicting the Rainfall, max/min temperature, SunShine hours

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Firestore client used to store the prediction results.
db = firestore.Client()

def predictAndPushData():
    """Rebuild ``all.csv``, predict four weather fields and store them in Firestore.

    The combined CSV is loaded, its "Non" column is dropped and missing
    values are forward-filled. One value is then predicted for each of
    Rainfall, MiniTemp, MaxTemp and Sunshine from a 7-value window, and the
    results are written to the "predicted" collection in a document named
    after today's date (YYYYMMDD).
    """
    combineDataset()
    df = pd.read_csv("all.csv")
    del df["Non"]
    df = df.ffill()

    now = arrow.now()
    inputDayNum = 7

    doc_ref = db.collection(u'predicted').document(now.format("YYYYMMDD"))
    doc_ref.set({
        u'Rainfall': predict(df["Rainfall"], inputDayNum),
        u'MiniTemp': predict(df["MiniTemp"], inputDayNum),
        u'MaxTemp': predict(df["MaxTemp"], inputDayNum),
        u'Sunshine': predict(df["Sunshine"], inputDayNum),
    })

A Snapshot of the Prediction (Notebook)

Source Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pandas as pd
import json
import arrow
from google.cloud import firestore
from google.cloud import storage
import os
import requests
import gcsfs
import json
import numpy as np
from sklearn.linear_model import LinearRegression
import schedule
import time

# Export the service-account key path and the project id through the
# environment before any Google Cloud client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=os.path.abspath("./prediction-melb-8c16f470aeda.json")
os.environ["GCLOUD_PROJECT"]="prediction-melb"

# Shared clients: Cloud Storage for the CSV files, Firestore for the results.
storage_client = storage.Client()
db = firestore.Client()

# Materialized list of the buckets returned by the client, and the bucket
# that the upload helper writes into.
buckets = list(storage_client.list_buckets())
bucket = storage_client.get_bucket("bomdata")

def isFileExistInBukect(file):
    """Return whether an object named *file* exists in the "bomdata" bucket."""
    target_bucket = storage_client.bucket("bomdata")
    candidate = storage.Blob(name=file, bucket=target_bucket)
    return candidate.exists(storage_client)

def uploadFileToBukect(filename):
    """Upload the local file *filename* to the bucket under the same object name."""
    local_path = os.path.abspath(filename)
    bucket.blob(filename).upload_from_filename(local_path)

def downloadMelbWeatherData():
    """Download the monthly Melbourne observation CSVs and upload them to the bucket.

    The CSV for the current month is always downloaded and uploaded again.
    After that, the current month and each of the 13 months before it are
    downloaded and uploaded only when the bucket does not already contain a
    file for that month.

    A file is uploaded only when ``wget`` exits with status 0, so a failed
    download never replaces or creates a bucket object.
    """
    # Human-readable page for one month, for reference:
    # http://www.bom.gov.au/climate/dwo/202005/html/IDCJDW3050.202005.shtml
    now = arrow.now()

    n = now.format("YYYYMM")
    print("downloading yesterday dataset "+n+".csv")
    url = "http://www.bom.gov.au/climate/dwo/"+n+"/text/IDCJDW3050."+n+".csv"
    if os.system("wget -O {0} {1}".format(n+".csv", url)) == 0:
        uploadFileToBukect(n+'.csv')
    else:
        print("download failed, not uploading "+n+".csv")

    for i in range(14):
        m = now.shift(months=-i).format("YYYYMM")
        filename = m+".csv"
        if isFileExistInBukect(filename):
            continue

        url = "http://www.bom.gov.au/climate/dwo/"+m+"/text/IDCJDW3050."+m+".csv"
        print("downloading "+url)
        if os.system("wget -O {0} {1}".format(filename, url)) == 0:
            uploadFileToBukect(filename)
        else:
            print("download failed, not uploading "+filename)

        # Drop the local copy; the existence check guards against wget not
        # having created the output file at all.
        if os.path.exists(filename):
            os.remove(filename)


def combineDataset():
    """Concatenate every object in the "bomdata" bucket into a local ``all.csv``.

    A fixed header row is written first. For each object listed in the bucket
    (in sorted name order), the first 9 lines are skipped and every remaining
    non-empty line is appended, one record per line.
    """
    # Close the key file as soon as it has been parsed.
    with open('prediction-melb-8c16f470aeda.json') as key_file:
        token = json.load(key_file)
    fs = gcsfs.GCSFileSystem(project='prediction-melb', token=token)

    skipped_lines = 9
    header = '"Non","Date","MiniTemp","MaxTemp","Rainfall","Evaporation (mm)","Sunshine","Direction of maximum wind gust ","Speed of maximum wind gust (km/h)","Time of maximum wind gust","9am Temperature (°C)","9am relative humidity (%)","9am cloud amount (oktas)","9am wind direction","9am wind speed (km/h)","9am MSL pressure (hPa)","3pm Temperature (°C)","3pm relative humidity (%)","3pm cloud amount (oktas)","3pm wind direction","3pm wind speed (km/h)","3pm MSL pressure (hPa)"\n'

    with open("all.csv", "w", encoding="utf-8") as f2:
        f2.write(header)
        # Sorting keeps the YYYYMM-named files in chronological order.
        for bom in sorted(fs.ls('bomdata')):
            with fs.open(bom) as f:
                lines = f.readlines()
            for raw in lines[skipped_lines:]:
                # The file is opened in binary mode; latin-1 maps every byte
                # to a character, so decoding cannot fail.
                text = raw.decode("latin-1").rstrip("\r\n")
                if text:
                    f2.write(text + "\n")

def convertTTData(array, inputDayNum):
    """Turn a 1-D series into sliding-window training data.

    Each sample is a window of ``inputDayNum`` consecutive values and its
    target is the value immediately after that window. Every position whose
    target still lies inside the series is used, including the last one.

    Args:
        array: One-dimensional sequence of values (list, NumPy array or
            pandas Series).
        inputDayNum: Number of consecutive values in each window.

    Returns:
        A tuple ``(x, y)``: ``x`` has shape ``(samples, inputDayNum)`` and
        ``y`` has shape ``(samples,)``. Both are empty when the series has no
        value after the first full window.
    """
    # Work on a plain positional array so the result does not depend on the
    # index labels of a pandas Series.
    values = np.asarray(array)
    sample_count = max(len(values) - inputDayNum, 0)
    x = [values[i:i + inputDayNum] for i in range(sample_count)]
    y = [values[i + inputDayNum] for i in range(sample_count)]
    return np.array(x).reshape(-1, inputDayNum), np.array(y)

def getLatetX(array, inputDayNum):
    """Return the last ``inputDayNum`` values of *array* as a NumPy array.

    When the series holds fewer than ``inputDayNum`` values, all of them are
    returned.
    """
    # Clamp at 0 so a short series does not produce a negative start index
    # that wraps around and drops the oldest values.
    start = max(len(array) - inputDayNum, 0)
    return np.array(array[start:])

def predict(array, inputDayNum):
    """Fit a linear regression on sliding windows of *array* and predict one value.

    Args:
        array: One-dimensional series of past values.
        inputDayNum: Number of consecutive values used as model input.

    Returns:
        The model's prediction for the window made of the last
        ``inputDayNum`` values of *array*.
    """
    windows, targets = convertTTData(array, inputDayNum)
    model = LinearRegression()
    model.fit(windows, targets)
    latest_window = getLatetX(array, inputDayNum)
    return model.predict([latest_window])[0]

def predictAndPushData():
    """Refresh the data, predict four weather fields and store them in Firestore.

    The monthly CSVs are downloaded and combined into ``all.csv``. The
    combined file is loaded, its "Non" column is dropped and missing values
    are forward-filled. One value is then predicted for each of Rainfall,
    MiniTemp, MaxTemp and Sunshine from a 15-value window, and the results
    are written to the "predicted" collection in a document named after
    today's date (YYYYMMDD).
    """
    downloadMelbWeatherData()
    combineDataset()

    df = pd.read_csv("all.csv")
    del df["Non"]
    df = df.ffill()

    now = arrow.now()
    inputDayNum = 15

    doc_ref = db.collection(u'predicted').document(now.format("YYYYMMDD"))
    doc_ref.set({
        u'Rainfall': predict(df["Rainfall"], inputDayNum),
        u'MiniTemp': predict(df["MiniTemp"], inputDayNum),
        u'MaxTemp': predict(df["MaxTemp"], inputDayNum),
        u'Sunshine': predict(df["Sunshine"], inputDayNum),
    })

# Register the whole pipeline to run once a day at 00:30.
schedule.every().day.at("00:30").do(predictAndPushData)

# Poll the scheduler every five minutes, forever.
while True:
    schedule.run_pending()
    time.sleep(5 * 60)

THE GOOGLE CLOUD PROJECT HAS BEEN REMOVED ON 4/6/2020