ML-using-Python/01 01 Split train test long script.py at main · Pablo-source/ML-using-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
# coding: utf-8
# # Ad hoc functions
# This workbook includes several ad hoc functions to be used in different scripts

import pandas as pd
import os
import matplotlib.pyplot as plt
import math

# Show the working directory the script was started from (it is changed below).
path = os.getcwd()
print(path)

# ### 1. Split initial data into Train and Test sets
# The traintestsplitn function defined below splits an input data frame into
# Train and Test sets. It takes two parameters:
# - "Inputdata": the data set you want to split
# - "perctrain": the percentage of rows assigned to the Train split (usually 80)

# NOTE(review): hard-coded absolute path; os.chdir fails on any machine where
# this directory does not exist. Every relative file name used later in the
# script (the input csv, TRAIN.csv, TEST.csv) is resolved against it.
os.chdir('C:/Pablo UK/46 DATA SCIENCE all/10 ML python JULY 2021')

# Read in dataset
Sales = pd.read_csv('Sales_to_split_TRAIN_TEST.csv')
# The value returned by head() is not used; nothing is printed when this file
# is run as a plain script (presumably a leftover from an interactive session).
Sales.head()

# Function to split input data into train test sets
def traintestsplitn(Inputdata, perctrain):
    """Split a data set into consecutive train and test sets.

    The first ``ceil(len(Inputdata) * perctrain / 100)`` rows form the train
    set and all remaining rows form the test set; rows keep their original
    order (no shuffling). Both sets are also written to the current working
    directory as 'TRAIN.csv' and 'TEST.csv', without the index column.

    Args:
        Inputdata: pandas DataFrame to split.
        perctrain: Percentage of rows (0 to 100) assigned to the train set.

    Returns:
        Tuple ``(train_output, test_output)`` of DataFrames.

    Raises:
        ValueError: If ``perctrain`` is not between 0 and 100.
    """
    # A negative percentage would become a negative slice bound, which counts
    # rows from the END of the data, so the split would silently get the wrong
    # proportions. A value above 100 would silently yield an empty test set.
    if not 0 <= perctrain <= 100:
        raise ValueError(
            'perctrain must be between 0 and 100, got %r' % (perctrain,))

    # Round up so that a fractional row count goes to the train set.
    train_ceil = math.ceil(len(Inputdata.index) * perctrain / 100)

    train_output = pd.DataFrame(Inputdata.iloc[:train_ceil])
    test_output = pd.DataFrame(Inputdata.iloc[train_ceil:])

    train_output.to_csv('TRAIN.csv', index=False)
    test_output.to_csv('TEST.csv', index=False)
    return train_output, test_output

# Use above function: 80% of the rows go to the train set. The returned frames
# are discarded; the call is made for the TRAIN.csv / TEST.csv files it writes.
traintestsplitn(Sales, 80)

# Read the two csv files written by traintestsplitn back in
train = pd.read_csv('TRAIN.csv')
train.head()
test = pd.read_csv('TEST.csv')
test.head()

# Check data type
print(isinstance(train, pd.DataFrame))
print(isinstance(test, pd.DataFrame))

# Check original file length matches TRAIN and TEST split
# NOTE(review): the bare expressions below are evaluated and thrown away, so
# they show nothing when this file is run as a script (presumably leftovers
# from an interactive session). checksplit further down does the actual check.
len(Sales)
len(train)
len(test)

len(train) + len(test)

# Function to check for the right length
def checksplit(main, train, test):
    """Print whether the train and test lengths add up to the input length.

    Prints 'The split has been successful' when ``len(main)`` equals
    ``len(train) + len(test)`` and 'Try again' otherwise. Returns None.
    """
    combined_length = len(train) + len(test)
    verdict = ('The split has been successful'
               if len(main) == combined_length
               else 'Try again')
    print(verdict)

# Use above function: compare the original data with the two splits read back from csv
checksplit(Sales, train, test)

# ## 3. Plot train and test series
# ** Important: DATE must be the index of both the TRAIN and TEST frames
# before plotting (presumably so the dates end up on the x axis). **
train_plot, test_plot = train.set_index('DATE'), test.set_index('DATE')
train_plot.head()
test_plot.head()

# Draw both splits on one shared figure, train first, then test.
plt.figure(figsize=(40, 20))
plt.title('Total sales')
for split_frame, split_label in ((train_plot, "Train data"),
                                 (test_plot, "Test data")):
    plt.plot(split_frame, label=split_label)
plt.legend()
plt.ylabel("value")
plt.xlabel("years")
plt.tick_params(axis='x', which='major', labelsize=3)
plt.show()

# ## 4. Check distribution of each data set
# One separate figure per split, drawn with identical settings:
# #### 4.1  Train set
# #### 4.2  Test set
for split_frame, split_label in ((train_plot, "Train data"),
                                 (test_plot, "Test data")):
    plt.figure(figsize=(20, 10))
    plt.title('Total sales')
    plt.plot(split_frame, label=split_label)
    plt.legend()
    plt.ylabel("value")
    plt.xlabel("years")
    plt.tick_params(axis='x', which='major', labelsize=3)
    plt.show()