# ADAPTED FROM https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression 
# sara@ceresAnalytics.com
# 617-519-3151

#I. Setup -- requires pre-installed libraries numpy, scikit-learn (imported as "sklearn"), matplotlib, and seaborn
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import plot_tree

# A. SET A RANDOM STATE AND SEE WHAT IT IS...
#    THE PRINTOUT STARTS WITH "RandomState(MT19937)", which means you are using a random number
#    generator based on the Mersenne Twister algorithm. SEEDING IT WITH 1 MAKES EVERY RUN REPRODUCIBLE
rng = np.random.RandomState(1)
print("rng is", rng)

#B. CREATE A RANGE OF RANDOM X VALUES FOR YOUR HORIZONTAL AXIS
#   rng.rand(80, 1) DRAWS UNIFORM VALUES IN [0, 1); MULTIPLYING BY 5 STRETCHES THEM TO [0, 5)
#   THIS WILL PRINT AS AN 80-ROW BY 1-COLUMN MATRIX...
#   IT IS ESTABLISHED AS A 2D ARRAY, WHICH IS THE CONFIGURATION YOU WOULD WANT IF YOU HAD MULTIPLE X's IN "REAL DATA"
#   HERE, THOUGH, ITS SINGLE COLUMN WILL BE USED IN PART C BELOW TO GENERATE Y AS A SINE
#   SORTING ALONG axis=0 PUTS THE ROWS IN ASCENDING ORDER
X = np.sort(5 * rng.rand(80, 1), axis=0)
print("X is\n", X)

#C. CREATE Y VALUES USING THE SINE FUNCTION, THEN ASSURE A FLAT 1D ARRAY (VECTOR) WITH ravel()
#   BECAUSE IT IS 1D, NUMPY PRINTS IT AS WRAPPED HORIZONTAL ROWS RATHER THAN AS A COLUMN
y = np.sin(X).ravel()
#   1. ADD SOME NOISE TO EVERY 5TH ELEMENT OF YOUR Y VECTOR -- UNIFORM IN (-1.5, 1.5]
#      THE NUMBER OF NOISE VALUES IS TAKEN FROM THE SLICE ITSELF (16 FOR 80 POINTS), SO THIS LINE
#      KEEPS WORKING IF YOU CHANGE THE SAMPLE COUNT OR THE STEP
y[::5] += 3 * (0.5 - rng.rand(y[::5].size))
print("y is\n", y)

#D. CHECK THE DIMENSIONS OF WHAT YOU CREATED: X IS 2D (ROWS, COLUMNS), y IS 1D (ROWS,)
print("X dims", X.shape)
print("y dims", y.shape)


#II. MAKE TWO TREES: THE ONLY STOPPING CRITERION YOU WILL USE IS THE TREE'S DEPTH (NUMBER OF LAYERS)
def _fit_and_plot_tree(max_depth, png_name):
    """Create a depth-limited regression tree, fit it to (X, y), and save its diagram.

    Args:
        max_depth: maximum number of layers the tree is allowed to grow.
        png_name: file name for the tree diagram. A relative name is written
            to the current working directory (the folder you run the program from).

    Returns:
        The fitted DecisionTreeRegressor.
    """
    #   1. create
    tree = DecisionTreeRegressor(max_depth=max_depth)
    #   2. fit
    tree.fit(X, y)
    #   3. plot the tree's nodes and save the picture
    fig = plt.figure(figsize=(18, 10))
    plot_tree(
        tree,
        feature_names=["X"],
        filled=True,
        rounded=True,
        fontsize=16,
        precision=1,
    )
    fig.savefig(png_name)
    # pyplot keeps every figure in memory until it is closed, so release it once it is on disk
    plt.close(fig)
    return tree


#A. create, fit and plot the 2-layer tree: diagram goes to "sine_tree_2_layers.png"
regr_1 = _fit_and_plot_tree(2, "sine_tree_2_layers.png")

#B. create, fit and plot the 5-layer tree: diagram goes to "sine_tree_5_layers.png"
regr_2 = _fit_and_plot_tree(5, "sine_tree_5_layers.png")


#III. MAKE UP "TEST" DATA... A SORTED SERIES FROM 0 to 4.99 IN 0.01 INCREMENTS
#     reshape(-1, 1) TURNS THE 1D SERIES INTO THE ONE-COLUMN 2D LAYOUT THE TREES WERE TRAINED ON
test_grid = np.arange(0.0, 5.0, 0.01)
X_test = test_grid.reshape(-1, 1)
#     TO LIMIT OUTPUT, PRINT THE FLATTENED (1D) VERSION RATHER THAN ONE VALUE PER ROW
print(X_test.ravel())

#    A. + B. APPLY EACH TREE TO PREDICT FITTED VALUES FOR X_test:
#            y_1 COMES FROM THE 2-LAYER TREE, y_2 FROM THE 5-LAYER TREE
y_1, y_2 = [fitted_tree.predict(X_test) for fitted_tree in (regr_1, regr_2)]

#IV. MORE PLOTS
def _save_sine_plot(png_name, title, fitted=None, line_color=None, line_label=None):
    """Scatter the training data, optionally overlay a tree's predictions, and save as PNG.

    Args:
        png_name: output file name. A relative name is written to the current
            working directory.
        title: title shown above the plot.
        fitted: predictions for X_test to draw as a line; None draws the scatter only.
        line_color: color of the prediction line (used only when fitted is given).
        line_label: legend entry of the prediction line (used only when fitted is given).
    """
    fig = plt.figure()
    # the training points are always drawn first so the prediction line sits on top of them
    plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data")
    if fitted is not None:
        plt.plot(X_test, fitted, color=line_color, label=line_label, linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title(title)
    plt.legend()
    fig.savefig(png_name)
    # pyplot keeps every figure in memory until it is closed, so release it once it is on disk
    plt.close(fig)


#A. FIRST THE X-Y DATA ON WHICH YOU TRAINED YOUR TREES: call this "sine_train_data.png"
_save_sine_plot("sine_train_data.png", "One X to break up for Tree segments")

#B. NEXT ADD THE PREDICTIONS FROM THE 2-LAYER TREE: call this "sine_test_preds_from_2layers.png"
_save_sine_plot(
    "sine_test_preds_from_2layers.png",
    "2-Layer Tree Predictions",
    fitted=y_1,
    line_color="cornflowerblue",
    line_label="max_depth=2",
)

#C. NOW THE PREDICTIONS FROM THE 5-LAYER TREE: call this "sine_test_preds_from_5layers.png"
_save_sine_plot(
    "sine_test_preds_from_5layers.png",
    "5-Layer Tree Predictions",
    fitted=y_2,
    line_color="yellowgreen",
    line_label="max_depth=5",
)