684 lines
143 KiB
Plaintext
684 lines
143 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "7fb27b941602401d91542211134fc71a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit\n",
|
|
"from sklearn.metrics import accuracy_score, mean_absolute_error\n",
|
|
"from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression\n",
|
|
"from sklearn.svm import LinearSVC\n",
|
|
"from sklearn.tree import (\n",
|
|
" DecisionTreeClassifier,\n",
|
|
" ExtraTreeClassifier,\n",
|
|
" DecisionTreeRegressor,\n",
|
|
")\n",
|
|
"from sklearn.ensemble import (\n",
|
|
" RandomForestClassifier,\n",
|
|
" AdaBoostClassifier,\n",
|
|
" BaggingClassifier,\n",
|
|
" ExtraTreesClassifier,\n",
|
|
" AdaBoostRegressor,\n",
|
|
")\n",
|
|
"from sklearn.decomposition import PCA, KernelPCA\n",
|
|
"from sklearn.cluster import KMeans, AgglomerativeClustering\n",
|
|
"from scipy.cluster.hierarchy import dendrogram, linkage\n",
|
|
"from sklearn.feature_selection import SelectKBest, SelectFromModel, f_classif\n",
|
|
"# from mlxtend.plotting import plot_decision_regions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "acae54e37e7d407bbb7b55eff062a284",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import warnings filter\n",
|
|
"from warnings import simplefilter\n",
|
|
"\n",
|
|
"# ignore all future warnings\n",
|
|
"simplefilter(action=\"ignore\", category=FutureWarning)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9a63283cbaf04dbcab1f6479b197f3a8",
|
|
"metadata": {},
|
|
"source": [
|
|
"# DataSet"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": 4,
"id": "8dd0d8092fe74a7c96281538738b07e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(33, 32)\n"
]
}
],
"source": [
"# Long-format table: one row per (material, adsorbate) pair.\n",
"# `squeeze=True` was dropped: it does nothing for a multi-column result and the\n",
"# keyword no longer exists in pandas >= 2.0. The separator is a raw string so\n",
"# that `\\s` is a regex token rather than an invalid string escape.\n",
"df = pd.read_csv(\n",
"    \"../top_Gads_updated.dat\",\n",
"    header=None,\n",
"    comment=\"#\",\n",
"    sep=r\"\\s+\",\n",
"    names=[\"material\", \"adsorbate\", \"DF_lower\", \"DF_upper\", \"DF\"],\n",
")\n",
"df = df[[\"material\", \"adsorbate\", \"DF\"]]\n",
"\n",
"# Labels in order of first appearance in the file.\n",
"aminoacid = df[\"adsorbate\"].unique()\n",
"aminoacids_label = list(aminoacid)\n",
"material_label = list(df[\"material\"].unique())\n",
"\n",
"# Wide table: one row per material, one column per adsorbate, values = DF.\n",
"# pivot() aligns the values by material label instead of by position in the\n",
"# file and raises on duplicate (material, adsorbate) pairs. reindex() keeps\n",
"# the first-appearance order for both rows and columns.\n",
"X = (\n",
"    df.pivot(index=\"material\", columns=\"adsorbate\", values=\"DF\")\n",
"    .reindex(index=material_label, columns=aminoacids_label)\n",
"    .rename_axis(index=None, columns=None)\n",
")\n",
"\n",
"print(X.shape)"
]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "72eea5119410473aa328ad9291626812",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data Preparation"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": 5,
"id": "8edb47106e1a46a883d545849b8ab81b",
"metadata": {},
"outputs": [],
"source": [
"# Short display names for three of the adsorbate columns.\n",
"column_labels = {\"C3H6\": \"PRP\", \"C4H6\": \"BUT-2\", \"C4H8\": \"BUT-1\"}\n",
"\n",
"# Display labels for the materials; `$...$` spans are matplotlib mathtext.\n",
"# NOTE(review): rename() ignores keys that are not present in the index, so\n",
"# these labels only take effect when X is indexed by the raw material names.\n",
"material_labels = {\n",
"    \"C_amorph-1\": \"C-AM-1\",\n",
"    \"C_amorph-2\": \"C-AM-2\",\n",
"    \"C_amorph-3\": \"C-AM-3\",\n",
"    \"CNT15-COO--10\": \"CNT-COO$^{-}$-high\",\n",
"    \"CNT15-COO--3\": \"CNT-COO$^{-}$-low\",\n",
"    \"CNT15-COOH-30\": \"CNT-COOH-high\",\n",
"    \"CNT15-COOH-3\": \"CNT-COOH-low\",\n",
"    \"CNT15-NH2-14\": \"CNT-NH$_{2}$-high\",\n",
"    \"CNT15-NH2-2\": \"CNT-NH$_{2}$-low\",\n",
"    \"CNT15-NH3+-4\": \"CNT-NH$_{3}^{+}$-high\",\n",
"    \"CNT15-NH3+-2\": \"CNT-NH$_{3}^{+}$-low\",\n",
"    \"CNT15-OH-14\": \"CNT-OH-high\",\n",
"    \"CNT15-OH-4\": \"CNT-OH-low\",\n",
"    \"CNT15\": \"CNT\",\n",
"    \"Fe2O3-001O\": \"Fe$_{2}$O$_{3}$(001)\",\n",
"    \"graphene\": \"GR\",\n",
"    \"bi-graphene\": \"bi-GR\",\n",
"    \"tri-graphene\": \"tri-GR\",\n",
"    \"grapheneoxide\": \"GO\",\n",
"    \"redgrapheneoxide\": \"rGO\",\n",
"    \"SiO2-Q2\": \"SiO$_{2}$-Q2\",\n",
"    \"SiO2-Q4\": \"SiO$_{2}$-Q4\",\n",
"    \"TiO2-rut-110\": \"TiO$_{2}$-rut(110)\",\n",
"    \"TiO2-ana-101\": \"TiO$_{2}$-ana(101)\",\n",
"    \"TiO2-rut-100\": \"TiO$_{2}$-rut(100)\",\n",
"    \"TiO2-ana-100\": \"TiO$_{2}$-ana(100)\",\n",
"    \"TiO2-ana-101-NB\": \"TiO$_{2}$-ana(101)-NB\",\n",
"    # Raw strings keep the LaTeX backslash; the stray closing brace after\n",
"    # \\overline{...} was removed because it made the mathtext unparsable.\n",
"    \"ZnO-1010\": r\"ZnO(10$\\overline{1}$0)\",\n",
"    \"ZnO-1210\": r\"ZnO(1$\\overline{2}$10)\",\n",
"    \"ZnS-110\": \"ZnS(110)\",\n",
"    \"ZnS-110-coated\": \"ZnS(110)-coated\",\n",
"}\n",
"\n",
"# Reassign instead of mutating in place so the cell can be re-run safely.\n",
"X = X.rename(columns=column_labels, index=material_labels)"
]
},
|
|
{
"cell_type": "code",
"execution_count": 6,
"id": "10185d26023b46108eb7d9f57d49d2b3",
"metadata": {},
"outputs": [],
"source": [
"# Columns of X used as regression inputs.\n",
"main = [\"ASP\", \"VAL\", \"PRO\"]\n",
"\n",
"# Every adsorbate column, in the order used for the result files and plots.\n",
"aminacid_order = [\n",
"    \"ALA\", \"ARG\", \"ASN\", \"ASP\", \"CYS\", \"CYM\", \"GLN\", \"GAN\",\n",
"    \"GLU\", \"HID\", \"HIE\", \"HIP\", \"ILE\", \"LEU\", \"LYS\", \"MET\",\n",
"    \"PHE\", \"SER\", \"THR\", \"TRP\", \"TYR\", \"VAL\", \"GLY\", \"PRO\",\n",
"    \"CHL\", \"PHO\", \"ETA\", \"EST\", \"PRP\", \"BUT-1\", \"BUT-2\", \"DGL\",\n",
"]"
]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8763a12b2bbd4a93a75aff182afb95dc",
|
|
"metadata": {},
|
|
"source": [
|
|
"# AdaBoostRegressor modelling using 50 DecisionTreeRegressor(max_depth=5), test_size=0.3 and 10 ShuffleSplit"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": 7,
"id": "7623eae2785240b9bd12b16a66d81610",
"metadata": {},
"outputs": [],
"source": [
"def AminoAcid_LR(i, n_splits=10, test_size=0.3, random_state=0):\n",
"    \"\"\"Cross-validate an AdaBoost regressor that predicts column ``i`` of ``X``.\n",
"\n",
"    The inputs are the columns ``X[main]``. For each of ``n_splits`` random\n",
"    train/test partitions (``ShuffleSplit``) an ``AdaBoostRegressor`` made of\n",
"    50 ``DecisionTreeRegressor(max_depth=5)`` is fitted and scored.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    i : label of the target column in ``X``.\n",
"    n_splits : number of random train/test partitions.\n",
"    test_size : fraction of rows held out in every partition.\n",
"    random_state : seed for the partitions and for the model, so that\n",
"        repeated calls return the same numbers.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of eight values, each rounded to two decimals:\n",
"    ``(r2_avg_train, r2_std_train, r2_avg_test, r2_std_test,\n",
"    MAE_avg_train, MAE_std_train, MAE_avg_test, MAE_std_test)``.\n",
"    \"\"\"\n",
"    features = X[main]\n",
"    target = X[i]\n",
"    splitter = ShuffleSplit(\n",
"        n_splits=n_splits, test_size=test_size, random_state=random_state\n",
"    )\n",
"    r2_train, r2_test, mae_train, mae_test = [], [], [], []\n",
"    for train_index, test_index in splitter.split(features):\n",
"        Xtrain, Ytrain = features.iloc[train_index], target.iloc[train_index]\n",
"        xtest, ytest = features.iloc[test_index], target.iloc[test_index]\n",
"        tree = DecisionTreeRegressor(max_depth=5, random_state=random_state)\n",
"        # The base estimator is passed positionally: its keyword was renamed\n",
"        # from `base_estimator` to `estimator` in scikit-learn 1.2.\n",
"        model = AdaBoostRegressor(tree, n_estimators=50, random_state=random_state)\n",
"        model.fit(Xtrain, Ytrain)\n",
"        # Keep full precision per split; rounding happens once, on the summary.\n",
"        r2_train.append(model.score(Xtrain, Ytrain))\n",
"        r2_test.append(model.score(xtest, ytest))\n",
"        mae_train.append(mean_absolute_error(Ytrain, model.predict(Xtrain)))\n",
"        mae_test.append(mean_absolute_error(ytest, model.predict(xtest)))\n",
"    summary = []\n",
"    for values in (r2_train, r2_test, mae_train, mae_test):\n",
"        summary.append(np.round(np.mean(values), 2))\n",
"        summary.append(np.round(np.std(values), 2))\n",
"    return tuple(summary)"
]
},
|
|
{
"cell_type": "code",
"execution_count": 7,
"id": "7cdc8c89c7104fffa095e18ddfef8986",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"# Targets: every adsorbate that is not used as an input column.\n",
"rest = [i for i in aminacid_order if i not in main]\n",
"\n",
"# Whitespace-separated rows; the header starts with '#' so that readers using\n",
"# comment=\"#\" skip it.\n",
"header = \"#AminoAcid, r2_avg_train, r2_std_train, r2_avg_test, r2_std_test, MAE_avg_train, MAE_std_train, MAE_avg_test, MAE_std_test\\n\"\n",
"row_format = \"%s %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\\n\"\n",
"\n",
"# Warnings are silenced only while the models are fitted, not for the rest of\n",
"# the session. Each row now ends with a single newline (no blank lines).\n",
"with warnings.catch_warnings(), open(\"AdaBoost_result.csv\", \"w\") as out_file:\n",
"    warnings.simplefilter(\"ignore\")\n",
"    out_file.write(header)\n",
"    for i in rest:\n",
"        out_file.write(row_format % ((i,) + tuple(AminoAcid_LR(i))))"
]
},
|
|
{
"cell_type": "markdown",
"id": "b118ea5561624da68c537baed56e602f",
"metadata": {},
"source": [
"Code kept as markdown (not executed): linear regression of each column in `G0` on the `main` columns. `G0` is not defined in the cells of this notebook, so the snippet cannot run as is.\n",
"\n",
"```python\n",
"fig, ax = plt.subplots(len(G0), 2, figsize=(10, 25))\n",
"fig.subplots_adjust(left=0.06, right=1, wspace=0.2)\n",
"X_LR=X[main]\n",
"def Mary_LR():\n",
"    for i in G0:\n",
"        Y=X[i]\n",
"        model= LinearRegression()\n",
"        model.fit(X_LR, Y)\n",
"        ymodel=model.predict(X_LR)\n",
"        print(np.round(cross_val_score(model,X_LR,Y,cv=3).mean(),2))\n",
"        ax[G0.index(i)][0].scatter(Y,ymodel,c='black',label=i)\n",
"        ax[G0.index(i)][0].plot(Y,Y,c='gray',label=np.round(model.score(X_LR,Y),2))\n",
"        ax[G0.index(i)][0].legend()\n",
"        ax[G0.index(i)][1].bar(main,model.coef_, color='b',width=0.5)\n",
"    plt.savefig(\"LR-G0.pdf\")\n",
"Mary_LR()\n",
"```"
]
},
|
|
{
"cell_type": "code",
"execution_count": 9,
"id": "938c804e27f84196a10c8828c723f798",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_5390/1595004092.py:9: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
" ax[0].set_yticklabels(df.AminoAcid, fontsize=12)\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1200x1000 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar colours, passed to every panel (29 entries).\n",
"flatui = [\n",
"    \"black\", \"grey\", \"rosybrown\", \"darkred\", \"indianred\", \"salmon\",\n",
"    \"red\", \"coral\", \"tan\", \"gold\", \"y\", \"olive\",\n",
"    \"yellow\", \"greenyellow\", \"darkgreen\", \"lime\", \"lightseagreen\", \"aqua\",\n",
"    \"lightsteelblue\", \"deepskyblue\", \"royalblue\", \"slateblue\", \"mediumpurple\", \"darkviolet\",\n",
"    \"violet\", \"magenta\", \"deeppink\", \"pink\", \"crimson\",\n",
"]\n",
"\n",
"# Read the scores into their own name so the dataset frame `df` is not\n",
"# overwritten. `squeeze=True` was dropped (removed in pandas 2.0, and a no-op\n",
"# for a multi-column file); the separator is a raw-string regex.\n",
"results = pd.read_csv(\n",
"    \"AdaBoost_result.csv\",\n",
"    header=None,\n",
"    comment=\"#\",\n",
"    sep=r\"\\s+\",\n",
"    names=[\n",
"        \"AminoAcid\",\n",
"        \"r2_avg_train\",\n",
"        \"r2_std_train\",\n",
"        \"r2_avg_test\",\n",
"        \"r2_std_test\",\n",
"        \"MAE_avg_train\",\n",
"        \"MAE_std_train\",\n",
"        \"MAE_avg_test\",\n",
"        \"MAE_std_test\",\n",
"    ],\n",
")\n",
"\n",
"fig, ax = plt.subplots(1, 4, figsize=(12, 10), sharey=True)\n",
"fig.subplots_adjust(left=0.06, right=0.95, wspace=0.1)\n",
"\n",
"ax[0].barh(results.AminoAcid, results.r2_avg_train, xerr=results.r2_std_train, color=flatui)\n",
"ax[0].set_xlabel(\"R2 score Train\", fontsize=14)\n",
"ax[0].set_xlim(0, 1.05)\n",
"ax[0].set_ylabel(\"Biomolecules\", fontsize=14)\n",
"# tick_params() resizes the existing tick labels; set_yticklabels() raised\n",
"# \"FixedFormatter should only be used together with FixedLocator\".\n",
"ax[0].tick_params(axis=\"y\", labelsize=12)\n",
"\n",
"ax[1].barh(results.AminoAcid, results.r2_avg_test, xerr=results.r2_std_test, color=flatui)\n",
"ax[1].set_xlabel(\"R2 score Test\", fontsize=14)\n",
"ax[1].set_xlim(-1, 1)\n",
"\n",
"ax[2].barh(results.AminoAcid, results.MAE_avg_train, xerr=results.MAE_std_train, color=flatui)\n",
"ax[2].set_xlabel(\"MAE (kJ/mol) Train\", fontsize=14)\n",
"\n",
"ax[3].barh(results.AminoAcid, results.MAE_avg_test, xerr=results.MAE_std_test, color=flatui)\n",
"ax[3].set_xlabel(\"MAE (kJ/mol) Test\", fontsize=14)\n",
"ax[3].set_xlim(0, 4)\n",
"\n",
"plt.savefig(\"AdaBoost_result.pdf\", format=\"pdf\", dpi=1000, bbox_inches=\"tight\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": 6,
"id": "504fb2a444614c0babb325280ed9130a",
"metadata": {},
"outputs": [],
"source": [
"# Extend the input columns by two and recompute the list of targets.\n",
"main = [\"ASP\", \"VAL\", \"PRO\"] + [\"ETA\", \"PHO\"]\n",
"rest = [name for name in aminacid_order if name not in main]"
]
},
|
|
{
"cell_type": "code",
"execution_count": 10,
"id": "59bbdb311c014d738909a11f9e486628",
"metadata": {},
"outputs": [],
"source": [
"def AminoAcid_LR(i, n_splits=20, test_size=0.3, random_state=0):\n",
"    \"\"\"Cross-validate an AdaBoost regressor that predicts column ``i`` of ``X``.\n",
"\n",
"    The inputs are the columns ``X[main]``. For each of ``n_splits`` random\n",
"    train/test partitions (``ShuffleSplit``) an ``AdaBoostRegressor`` made of\n",
"    50 ``DecisionTreeRegressor(max_depth=5)`` is fitted and scored.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    i : label of the target column in ``X``.\n",
"    n_splits : number of random train/test partitions (20 by default here).\n",
"    test_size : fraction of rows held out in every partition.\n",
"    random_state : seed for the partitions and for the model, so that\n",
"        repeated calls return the same numbers.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of eight values, each rounded to two decimals:\n",
"    ``(r2_avg_train, r2_std_train, r2_avg_test, r2_std_test,\n",
"    MAE_avg_train, MAE_std_train, MAE_avg_test, MAE_std_test)``.\n",
"    \"\"\"\n",
"    features = X[main]\n",
"    target = X[i]\n",
"    splitter = ShuffleSplit(\n",
"        n_splits=n_splits, test_size=test_size, random_state=random_state\n",
"    )\n",
"    r2_train, r2_test, mae_train, mae_test = [], [], [], []\n",
"    for train_index, test_index in splitter.split(features):\n",
"        Xtrain, Ytrain = features.iloc[train_index], target.iloc[train_index]\n",
"        xtest, ytest = features.iloc[test_index], target.iloc[test_index]\n",
"        tree = DecisionTreeRegressor(max_depth=5, random_state=random_state)\n",
"        # The base estimator is passed positionally: its keyword was renamed\n",
"        # from `base_estimator` to `estimator` in scikit-learn 1.2.\n",
"        model = AdaBoostRegressor(tree, n_estimators=50, random_state=random_state)\n",
"        model.fit(Xtrain, Ytrain)\n",
"        # Keep full precision per split; rounding happens once, on the summary.\n",
"        r2_train.append(model.score(Xtrain, Ytrain))\n",
"        r2_test.append(model.score(xtest, ytest))\n",
"        mae_train.append(mean_absolute_error(Ytrain, model.predict(Xtrain)))\n",
"        mae_test.append(mean_absolute_error(ytest, model.predict(xtest)))\n",
"    summary = []\n",
"    for values in (r2_train, r2_test, mae_train, mae_test):\n",
"        summary.append(np.round(np.mean(values), 2))\n",
"        summary.append(np.round(np.std(values), 2))\n",
"    return tuple(summary)"
]
},
|
|
{
"cell_type": "code",
"execution_count": 11,
"id": "b43b363d81ae4b689946ece5c682cd59",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"# Targets: every adsorbate that is not used as an input column.\n",
"rest = [i for i in aminacid_order if i not in main]\n",
"\n",
"# Whitespace-separated rows; the header starts with '#' so that readers using\n",
"# comment=\"#\" skip it.\n",
"header = \"#AminoAcid, r2_avg_train, r2_std_train, r2_avg_test, r2_std_test, MAE_avg_train, MAE_std_train, MAE_avg_test, MAE_std_test\\n\"\n",
"row_format = \"%s %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\\n\"\n",
"\n",
"# Warnings are silenced only while the models are fitted, not for the rest of\n",
"# the session. Each row now ends with a single newline (no blank lines).\n",
"with warnings.catch_warnings(), open(\"AdaBoost_modify_result.csv\", \"w\") as out_file:\n",
"    warnings.simplefilter(\"ignore\")\n",
"    out_file.write(header)\n",
"    for i in rest:\n",
"        out_file.write(row_format % ((i,) + tuple(AminoAcid_LR(i))))"
]
},
|
|
{
"cell_type": "code",
"execution_count": 6,
"id": "8a65eabff63a45729fe45fb5ade58bdc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_28522/836274381.py:12: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
" ax[0].set_yticklabels(df.AminoAcid, fontsize=12)\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1200x1000 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar colours, passed to every panel (27 entries).\n",
"flatui = [\n",
"    \"black\", \"grey\", \"rosybrown\", \"darkred\", \"indianred\", \"salmon\",\n",
"    \"red\", \"coral\", \"tan\", \"gold\", \"y\", \"olive\",\n",
"    \"yellow\", \"greenyellow\", \"darkgreen\", \"lime\", \"lightseagreen\", \"aqua\",\n",
"    \"lightsteelblue\", \"deepskyblue\", \"royalblue\", \"slateblue\", \"violet\", \"magenta\",\n",
"    \"deeppink\", \"pink\", \"crimson\",\n",
"]\n",
"\n",
"# Read the scores into their own name so the dataset frame `df` is not\n",
"# overwritten. `squeeze=True` was dropped (removed in pandas 2.0, and a no-op\n",
"# for a multi-column file); the separator is a raw-string regex.\n",
"results = pd.read_csv(\n",
"    \"AdaBoost_modify_result.csv\",\n",
"    header=None,\n",
"    comment=\"#\",\n",
"    sep=r\"\\s+\",\n",
"    names=[\n",
"        \"AminoAcid\",\n",
"        \"r2_avg_train\",\n",
"        \"r2_std_train\",\n",
"        \"r2_avg_test\",\n",
"        \"r2_std_test\",\n",
"        \"MAE_avg_train\",\n",
"        \"MAE_std_train\",\n",
"        \"MAE_avg_test\",\n",
"        \"MAE_std_test\",\n",
"    ],\n",
")\n",
"\n",
"fig, ax = plt.subplots(1, 4, figsize=(12, 10), sharey=True)\n",
"fig.subplots_adjust(left=0.06, right=0.95, wspace=0.1)\n",
"\n",
"ax[0].barh(results.AminoAcid, results.r2_avg_train, xerr=results.r2_std_train, color=flatui)\n",
"ax[0].set_xlabel(\"R2 score Train\", fontsize=14)\n",
"ax[0].set_xlim(0, 1.05)\n",
"ax[0].set_ylabel(\"Biomolecules\", fontsize=14)\n",
"# tick_params() resizes the existing tick labels; set_yticklabels() raised\n",
"# \"FixedFormatter should only be used together with FixedLocator\".\n",
"ax[0].tick_params(axis=\"y\", labelsize=12)\n",
"\n",
"ax[1].barh(results.AminoAcid, results.r2_avg_test, xerr=results.r2_std_test, color=flatui)\n",
"ax[1].set_xlabel(\"R2 score Test\", fontsize=14)\n",
"ax[1].set_xlim(0, 1.05)\n",
"\n",
"ax[2].barh(results.AminoAcid, results.MAE_avg_train, xerr=results.MAE_std_train, color=flatui)\n",
"ax[2].set_xlabel(\"MAE (kJ/mol) Train\", fontsize=14)\n",
"\n",
"ax[3].barh(results.AminoAcid, results.MAE_avg_test, xerr=results.MAE_std_test, color=flatui)\n",
"ax[3].set_xlabel(\"MAE (kJ/mol) Test\", fontsize=14)\n",
"ax[3].set_xlim(0, 4)\n",
"\n",
"plt.savefig(\"AdaBoost_modify_result.png\", format=\"png\", dpi=1000, bbox_inches=\"tight\")"
]
},
|
|
{
"cell_type": "markdown",
"id": "c3933fab20d04ec698c2621248eb3be0",
"metadata": {},
"source": [
"# Predicted vs Real values for testing data set (one of 10)"
]
},
|
|
{
"cell_type": "code",
"execution_count": 13,
"id": "4dd4641cc4064e0191573fe9c69df29b",
"metadata": {},
"outputs": [],
"source": [
"main = [\"ASP\", \"VAL\", \"PRO\", \"ETA\", \"PHO\"]\n",
"rest = [i for i in aminacid_order if i not in main]\n",
"X_LR = X[main]\n",
"\n",
"\n",
"def Mary_pred(i):\n",
"    \"\"\"Fit AdaBoost on one fixed train/test split and return the test predictions.\n",
"\n",
"    The inputs are the module-level ``X_LR`` and the target is column ``i`` of\n",
"    ``X``. The split (``test_size=0.3``, ``random_state=110``) and the model\n",
"    (``random_state=1``) are both seeded.\n",
"\n",
"    Returns a dict with the keys ``\"Aminoacid\"`` (the label ``i``),\n",
"    ``\"Predict\"`` and ``\"Real\"`` (lists over the test rows), ``\"score\"``\n",
"    (test R2 rounded to two decimals) and ``\"Method\"`` (``\"AdaBoost\"``).\n",
"    \"\"\"\n",
"    Y = X[i]\n",
"    Xtrain, xtest, Ytrain, ytest = train_test_split(\n",
"        X_LR, Y, test_size=0.3, random_state=110\n",
"    )\n",
"    DTR = DecisionTreeRegressor(max_depth=5, random_state=1)\n",
"    # The base estimator is passed positionally: its keyword was renamed from\n",
"    # `base_estimator` to `estimator` in scikit-learn 1.2.\n",
"    model = AdaBoostRegressor(DTR, n_estimators=50, random_state=1)\n",
"    model.fit(Xtrain, Ytrain)\n",
"    ytest_pred = model.predict(xtest)\n",
"    test_score = np.round(model.score(xtest, ytest), 2)\n",
"    return {\n",
"        \"Aminoacid\": i,\n",
"        \"Predict\": list(ytest_pred),\n",
"        \"Real\": list(ytest),\n",
"        \"score\": test_score,\n",
"        \"Method\": \"AdaBoost\",\n",
"    }"
]
},
|
|
{
"cell_type": "code",
"execution_count": 14,
"id": "8309879909854d7188b41380fd92a7c3",
"metadata": {},
"outputs": [],
"source": [
"# One frame per target; the scalar entries of each dict are repeated on every\n",
"# test row. Built without a loop variable named `df`, so the dataset frame is\n",
"# not overwritten.\n",
"df_list = [pd.DataFrame.from_dict(Mary_pred(i)) for i in rest]\n",
"result = pd.concat(df_list, ignore_index=True)\n",
"result.to_csv(\"AdaBoost_ypredict-yreal-110.csv\")"
]
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|