Skip to content

Commit 096859a

Browse files
committed
Use a custom SQL tokenizer
1 parent 95f2f41 commit 096859a

File tree

11 files changed

+100
-37
lines changed

11 files changed

+100
-37
lines changed

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ COPY api/api.py /app
1313
COPY api/pyproject.toml /app
1414
COPY api/poetry.lock /app
1515
COPY dataset/${dataset} /app
16+
COPY training/sql_tokenizer.py /app/
17+
COPY training/sql_tokenizer_vocab.json /app/
1618
COPY sqli_model/ /app/sqli_model/
1719
RUN pip install --disable-pip-version-check poetry
1820
RUN poetry install --no-root

api/api.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from flask import Flask, jsonify, request
22
import tensorflow as tf
3-
from tensorflow.keras.preprocessing.sequence import pad_sequences
4-
from tensorflow.keras.preprocessing.text import Tokenizer
53
import pandas as pd
64
import os
5+
from sql_tokenizer import SQLTokenizer # Import SQLTokenizer
76

87
app = Flask(__name__)
98

@@ -12,11 +11,11 @@
1211
MAX_LEN = 100
1312
DATASET_PATH = os.getenv("DATASET_PATH", "dataset/sqli_dataset1.csv")
1413
MODEL_PATH = os.getenv("MODEL_PATH", "/app/sqli_model/3/")
15-
DATASET = pd.read_csv(DATASET_PATH)
1614

17-
# Tokenizer setup
18-
TOKENIZER = Tokenizer(num_words=MAX_WORDS, filters="")
19-
TOKENIZER.fit_on_texts(DATASET["Query"])
15+
# Load dataset and initialize SQLTokenizer
16+
DATASET = pd.read_csv(DATASET_PATH)
17+
sql_tokenizer = SQLTokenizer(max_words=MAX_WORDS, max_len=MAX_LEN)
18+
sql_tokenizer.fit_on_texts(DATASET["Query"]) # Fit tokenizer on dataset
2019

2120
# Load the model using tf.saved_model.load and get the serving signature
2221
loaded_model = tf.saved_model.load(MODEL_PATH)
@@ -26,9 +25,8 @@
2625
def warm_up_model():
2726
"""Sends a dummy request to the model to 'warm it up'."""
2827
dummy_query = "SELECT * FROM users WHERE id = 1"
29-
query_seq = TOKENIZER.texts_to_sequences([dummy_query])
30-
query_vec = pad_sequences(query_seq, maxlen=MAX_LEN)
31-
input_tensor = tf.convert_to_tensor(query_vec, dtype=tf.float32)
28+
query_seq = sql_tokenizer.texts_to_sequences([dummy_query])
29+
input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)
3230
_ = model_predict(input_tensor) # Make a dummy prediction to initialize the model
3331
print("Model warmed up and ready to serve requests.")
3432

@@ -39,27 +37,26 @@ def predict():
3937
return jsonify({"error": "No query provided"}), 400
4038

4139
try:
42-
# Tokenize and pad the input query
40+
# Tokenize and pad the input query using SQLTokenizer
4341
query = request.json["query"]
44-
query_seq = TOKENIZER.texts_to_sequences([query])
45-
query_vec = pad_sequences(query_seq, maxlen=MAX_LEN)
46-
47-
# Convert input to tensor
48-
input_tensor = tf.convert_to_tensor(query_vec, dtype=tf.float32)
42+
query_seq = sql_tokenizer.texts_to_sequences([query])
43+
input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)
4944

5045
# Use the loaded model's serving signature to make the prediction
5146
prediction = model_predict(input_tensor)
5247

48+
# Check for valid output and extract the result
5349
if "output_0" not in prediction or prediction["output_0"].get_shape() != [1, 1]:
5450
return jsonify({"error": "Invalid model output"}), 500
5551

52+
# Extract confidence and return the response
5653
return jsonify(
5754
{
5855
"confidence": float("%.4f" % prediction["output_0"].numpy()[0][0]),
5956
}
6057
)
6158
except Exception as e:
62-
# TODO: Log the error and return a proper error message
59+
# Log the error and return a proper error message
6360
return jsonify({"error": str(e)}), 500
6461

6562

sqli_model/3/fingerprint.pb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
��������=�������ʒ���月�� �����Ϗ�(���������2
1+
���־���������月�� �����Ϗ�(�������2

sqli_model/3/saved_model.pb

-24 Bytes
Binary file not shown.
-18.3 MB
Binary file not shown.
-2 Bytes
Binary file not shown.

training/sql_tokenizer.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# sql_tokenizer.py
2+
import re
3+
import json
4+
from tensorflow.keras.preprocessing.sequence import pad_sequences
5+
6+
7+
class SQLTokenizer:
8+
def __init__(self, max_words=10000, max_len=100):
9+
self.max_words = max_words
10+
self.max_len = max_len
11+
self.token_index = {}
12+
13+
def tokenize(self, query):
14+
# Define a regex pattern for SQL tokens (operators, punctuation, keywords)
15+
pattern = r"[\w']+|[=><!]+|--|/\*|\*/|;|\(|\)|,|\*|\||\s+"
16+
tokens = re.findall(pattern, query.lower())
17+
return tokens
18+
19+
def fit_on_texts(self, queries):
20+
# Build a token index based on the provided queries
21+
all_tokens = set()
22+
for query in queries:
23+
tokens = self.tokenize(query)
24+
all_tokens.update(tokens)
25+
# Limit to max_words
26+
all_tokens = list(all_tokens)[: self.max_words]
27+
self.token_index = {token: i + 1 for i, token in enumerate(all_tokens)}
28+
29+
def texts_to_sequences(self, queries):
30+
# Convert queries to sequences of token IDs
31+
sequences = []
32+
for query in queries:
33+
tokens = self.tokenize(query)
34+
sequence = [self.token_index.get(token, 0) for token in tokens]
35+
sequences.append(sequence)
36+
return pad_sequences(sequences, maxlen=self.max_len)
37+
38+
def save_token_index(self, filepath):
39+
with open(filepath, "w") as f:
40+
json.dump(self.token_index, f)
41+
42+
def load_token_index(self, filepath):
43+
with open(filepath, "r") as f:
44+
self.token_index = json.load(f)

training/sql_tokenizer_vocab.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

training/test_train.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,32 @@
1+
import os
12
import pandas as pd
23
import pytest
3-
from tensorflow.keras.preprocessing.text import Tokenizer
44
from tensorflow.keras.preprocessing.sequence import pad_sequences
55
from tensorflow.keras.layers import TFSMLayer
6-
6+
from sql_tokenizer import SQLTokenizer
7+
from tensorflow.keras.preprocessing.text import Tokenizer # For old tokenizer
78

89
MAX_WORDS = 10000
910
MAX_LEN = 100
11+
TOKENIZER_VOCAB_PATH = "sql_tokenizer_vocab.json" # Path to saved vocabulary
12+
1013
MODELV1 = {
1114
"dataset": "dataset/sqli_dataset1.csv",
1215
"model_path": "sqli_model/1",
1316
"index": 0,
17+
"use_sql_tokenizer": False,
1418
}
1519
MODELV2 = {
1620
"dataset": "dataset/sqli_dataset2.csv",
1721
"model_path": "sqli_model/2",
1822
"index": 1,
23+
"use_sql_tokenizer": False,
1924
}
2025
MODELV3 = {
2126
"dataset": "dataset/sqli_dataset2.csv",
2227
"model_path": "sqli_model/3",
2328
"index": 2,
29+
"use_sql_tokenizer": True,
2430
}
2531

2632

@@ -46,9 +52,23 @@ def model(request):
4652
model_path = prefix + request.param["model_path"]
4753
sqli_model = TFSMLayer(model_path, call_endpoint="serving_default")
4854

49-
# Tokenizer setup
50-
tokenizer = Tokenizer(num_words=MAX_WORDS, filters="")
51-
tokenizer.fit_on_texts(data["Query"])
55+
# Select the appropriate tokenizer
56+
if request.param["use_sql_tokenizer"]:
57+
# Use SQLTokenizer for MODELV3
58+
tokenizer = SQLTokenizer(max_words=MAX_WORDS, max_len=MAX_LEN)
59+
60+
# Load saved vocabulary if available
61+
if os.path.exists(TOKENIZER_VOCAB_PATH):
62+
tokenizer.load_token_index(TOKENIZER_VOCAB_PATH)
63+
else:
64+
tokenizer.fit_on_texts(data["Query"])
65+
tokenizer.save_token_index(
66+
TOKENIZER_VOCAB_PATH
67+
) # Save for future consistency
68+
else:
69+
# Use the old Keras Tokenizer for MODELV1 and MODELV2
70+
tokenizer = Tokenizer(num_words=MAX_WORDS, filters="")
71+
tokenizer.fit_on_texts(data["Query"])
5272

5373
return {
5474
"tokenizer": tokenizer,
@@ -60,10 +80,10 @@ def model(request):
6080
@pytest.mark.parametrize(
6181
"sample",
6282
[
63-
("select * from users where id=1 or 1=1;", [0.9202, 0.974, 0.0022]),
64-
("select * from users where id='1' or 1=1--", [0.9202, 0.974, 0.0022]),
83+
("select * from users where id=1 or 1=1;", [0.9202, 0.974, 0.3179]),
84+
("select * from users where id='1' or 1=1--", [0.9202, 0.974, 0.3179]),
6585
("select * from users", [0.00077, 0.0015, 0.0231]),
66-
("select * from users where id=10000", [0.1483, 0.8893, 0.0008]),
86+
("select * from users where id=10000", [0.1483, 0.8893, 0.7307]),
6787
("select '1' union select 'a'; -- -'", [0.9999, 0.9732, 0.0139]),
6888
(
6989
"select '' union select 'malicious php code' \\g /var/www/test.php; -- -';",
@@ -76,7 +96,7 @@ def model(request):
7696
],
7797
)
7898
def test_sqli_model(model, sample):
79-
# Vectorize the sample
99+
# Tokenize and pad the sample using the selected tokenizer
80100
sample_seq = model["tokenizer"].texts_to_sequences([sample[0]])
81101
sample_vec = pad_sequences(sample_seq, maxlen=MAX_LEN)
82102

@@ -91,4 +111,5 @@ def test_sqli_model(model, sample):
91111
f"Predicted: {predicted_value:.4f}, Expected: {sample[1][model['index']]:.4f}"
92112
)
93113

114+
# Check that prediction matches expected value within tolerance
94115
assert predicted_value == pytest.approx(sample[1][model["index"]], abs=0.05)

training/train_v3.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
import os
33
import pandas as pd
44
import tensorflow as tf
5-
from tensorflow.keras.preprocessing.text import Tokenizer
6-
from tensorflow.keras.preprocessing.sequence import pad_sequences
75
from tensorflow.keras.models import Sequential
86
from tensorflow.keras.layers import (
97
Bidirectional,
@@ -21,6 +19,7 @@
2119
from sklearn.utils.class_weight import compute_class_weight
2220
import numpy as np
2321
import matplotlib.pyplot as plt
22+
from sql_tokenizer import SQLTokenizer
2423

2524

2625
def load_data(file_path):
@@ -33,11 +32,11 @@ def load_data(file_path):
3332

3433

3534
def preprocess_text(data, max_words=10000, max_len=100):
36-
"""Tokenize and pad text data."""
37-
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
35+
"""Tokenize and pad text data using SQLTokenizer."""
36+
tokenizer = SQLTokenizer(max_words=max_words)
3837
tokenizer.fit_on_texts(data["Query"])
3938
sequences = tokenizer.texts_to_sequences(data["Query"])
40-
return pad_sequences(sequences, maxlen=max_len), tokenizer
39+
return sequences, tokenizer
4140

4241

4342
def build_model(input_dim, output_dim=128):
@@ -97,19 +96,18 @@ def plot_history(history):
9796

9897
if __name__ == "__main__":
9998
if len(sys.argv) != 3:
100-
print("Usage: python train.py <input_file> <output_dir>")
99+
print("Usage: python train_v3.py <input_file> <output_dir>")
101100
sys.exit(1)
102101

103-
# Constants
104102
MAX_WORDS = 10000
105103
MAX_LEN = 100
106104
EPOCHS = 50
107105
BATCH_SIZE = 32
108106

109107
# Load and preprocess data
110108
data = load_data(sys.argv[1])
111-
X, tokenizer = preprocess_text(data)
112-
y = data["Label"].values # Convert to NumPy array to avoid KeyError in KFold
109+
X, tokenizer = preprocess_text(data, max_words=MAX_WORDS)
110+
y = data["Label"].values # Convert to NumPy array for compatibility with KFold
113111

114112
# Initialize cross-validation
115113
k_folds = 5
@@ -120,7 +118,7 @@ def plot_history(history):
120118
print(f"Training fold {fold}/{k_folds}")
121119

122120
# Split the data
123-
X_train, X_val = X[train_idx], X[val_idx]
121+
X_train, X_val = np.array(X)[train_idx], np.array(X)[val_idx]
124122
y_train, y_val = y[train_idx], y[val_idx]
125123

126124
# Compute class weights to handle imbalance
@@ -130,7 +128,7 @@ def plot_history(history):
130128
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
131129

132130
# Build and train the model
133-
model = build_model(input_dim=len(tokenizer.word_index) + 1)
131+
model = build_model(input_dim=len(tokenizer.token_index) + 1)
134132
early_stopping = EarlyStopping(
135133
monitor="val_loss", patience=5, restore_best_weights=True
136134
)

0 commit comments

Comments
 (0)