Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions machine_learning/multinomial_naive_bayes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Multinomial Naive Bayes Classifier implementation.

This module implements Multinomial Naive Bayes from scratch without using
external machine learning libraries. It is commonly used for text
classification tasks such as spam detection.

References:
https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_bayes
"""

import math


class MultinomialNaiveBayes:
    """
    Multinomial Naive Bayes classifier for count-valued features.

    Per-class feature probabilities are estimated from the summed training
    counts with additive (Laplace) smoothing. Priors and feature
    probabilities are stored as logarithms and combined by addition.
    """

    def __init__(self, alpha: float = 1.0) -> None:
        """
        Initialize the classifier.

        :param alpha: Laplace smoothing parameter
        :raises ValueError: If alpha is not greater than 0

        >>> MultinomialNaiveBayes(alpha=0)
        Traceback (most recent call last):
            ...
        ValueError: Alpha must be greater than 0
        """
        if alpha <= 0:
            raise ValueError("Alpha must be greater than 0")

        self.alpha = alpha
        # NOTE: despite the name, this maps each label to its *log* prior.
        self.class_priors: dict[int, float] = {}
        # Maps each label to the smoothed log-probability of every feature.
        self.feature_log_prob: dict[int, list[float]] = {}
        self.num_features: int = 0

    def fit(self, features: list[list[int]], labels: list[int]) -> None:
        """
        Train the Multinomial Naive Bayes classifier.

        Any state learned by a previous call is discarded, so refitting never
        leaves classes from older training data behind. The model is only
        updated once the whole input has been validated.

        :param features: Feature matrix (counts of features)
        :param labels: Class labels
        :raises ValueError: If input sizes mismatch, the matrix is empty,
            rows differ in length, or a count is negative

        >>> model = MultinomialNaiveBayes()
        >>> X = [[2, 1], [1, 1], [0, 2]]
        >>> y = [0, 0, 1]
        >>> model.fit(X, y)
        >>> model.fit([[3, 0], [0, 3]], [5, 6])
        >>> sorted(model.class_priors)
        [5, 6]
        >>> model.fit([[1, 2], [1]], [0, 1])
        Traceback (most recent call last):
            ...
        ValueError: All feature rows must have the same length
        """
        if len(features) != len(labels):
            raise ValueError("Features and labels must have the same length")

        if not features:
            raise ValueError("Feature matrix must not be empty")

        num_features = len(features[0])

        # Group the rows by label, validating every row on the way.
        separated: dict[int, list[list[int]]] = {}
        for row, label in zip(features, labels):
            if len(row) != num_features:
                raise ValueError("All feature rows must have the same length")
            if any(value < 0 for value in row):
                raise ValueError("Feature counts must be non-negative")
            separated.setdefault(label, []).append(row)

        total_samples = len(labels)

        # Build into fresh dictionaries so that a refit replaces (rather than
        # merges with) whatever an earlier fit stored.
        class_priors: dict[int, float] = {}
        feature_log_prob: dict[int, list[float]] = {}

        for label, rows in separated.items():
            class_priors[label] = math.log(len(rows) / total_samples)

            # Column sums give the total count of each feature in this class.
            feature_counts = [sum(column) for column in zip(*rows)]
            denominator = sum(feature_counts) + self.alpha * num_features

            feature_log_prob[label] = [
                math.log((count + self.alpha) / denominator)
                for count in feature_counts
            ]

        self.num_features = num_features
        self.class_priors = class_priors
        self.feature_log_prob = feature_log_prob

    def _joint_log_likelihood(self, label: int, row: list[int]) -> float:
        """Return log prior plus count-weighted feature log-probabilities."""
        log_probabilities = self.feature_log_prob[label]
        return sum(
            (value * log_probabilities[index] for index, value in enumerate(row)),
            self.class_priors[label],
        )

    def predict(self, features: list[list[int]]) -> list[int]:
        """
        Predict class labels for input features.

        When several classes share the best score, the one that appeared
        first in the training labels is returned.

        :param features: Feature matrix
        :return: Predicted labels
        :raises ValueError: If the model has not been fitted or a row does
            not have the number of features seen during fit

        >>> model = MultinomialNaiveBayes()
        >>> X = [[2, 1], [1, 1], [0, 2]]
        >>> y = [0, 0, 1]
        >>> model.fit(X, y)
        >>> model.predict([[1, 0], [0, 2]])
        [0, 1]
        >>> MultinomialNaiveBayes().predict([[1, 0]])
        Traceback (most recent call last):
            ...
        ValueError: Model must be fitted before calling predict
        """
        if not self.class_priors:
            raise ValueError("Model must be fitted before calling predict")

        predictions: list[int] = []

        for row in features:
            if len(row) != self.num_features:
                raise ValueError(
                    f"Expected {self.num_features} features per row, got {len(row)}"
                )

            class_scores = {
                label: self._joint_log_likelihood(label, row)
                for label in self.class_priors
            }
            # max() over the dict keeps the first label on ties.
            predictions.append(max(class_scores, key=class_scores.__getitem__))

        return predictions
107 changes: 107 additions & 0 deletions machine_learning/naive_bayes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Naive Bayes Classifier implementation.

This module implements Gaussian Naive Bayes from scratch without using
external machine learning libraries.

References:
https://en.wikipedia.org/wiki/Naive_Bayes_classifier
"""

from typing import Dict, List, Tuple

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.Tuple` is deprecated, use `tuple` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.List` is deprecated, use `list` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.Dict` is deprecated, use `dict` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.Tuple` is deprecated, use `tuple` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.List` is deprecated, use `list` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.Dict` is deprecated, use `dict` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.Tuple` is deprecated, use `tuple` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.List` is deprecated, use `list` instead

Check failure on line 11 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/naive_bayes.py:11:1: UP035 `typing.Dict` is deprecated, use `dict` instead
import math

Check failure on line 12 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (I001)

machine_learning/naive_bayes.py:11:1: I001 Import block is un-sorted or un-formatted

Check failure on line 12 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (I001)

machine_learning/naive_bayes.py:11:1: I001 Import block is un-sorted or un-formatted

Check failure on line 12 in machine_learning/naive_bayes.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (I001)

machine_learning/naive_bayes.py:11:1: I001 Import block is un-sorted or un-formatted


def gaussian_probability(x: float, mean: float, variance: float) -> float:
    """
    Calculate Gaussian probability density.

    :param x: Point at which the density is evaluated
    :param mean: Mean of the distribution
    :param variance: Variance of the distribution
    :return: Density at x; 0.0 is returned when the variance is exactly 0
    :raises ValueError: If variance is negative

    >>> round(gaussian_probability(1.0, 1.0, 1.0), 3)
    0.399
    >>> gaussian_probability(1.0, 1.0, 0.0)
    0.0
    >>> gaussian_probability(100.0, 0.0, -1.0)
    Traceback (most recent call last):
        ...
    ValueError: Variance must be non-negative
    """
    # Reject a negative variance up front: otherwise the failure depends on
    # the inputs (OverflowError from exp, or a bare "math domain error").
    if variance < 0.0:
        raise ValueError("Variance must be non-negative")

    # A zero variance would divide by zero below; this function reports a
    # density of 0.0 for that degenerate case.
    if variance == 0.0:
        return 0.0

    exponent = math.exp(-((x - mean) ** 2) / (2.0 * variance))
    coefficient = 1.0 / math.sqrt(2.0 * math.pi * variance)
    return coefficient * exponent


class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Every feature is modelled per class by a normal distribution whose mean
    and (population) variance are estimated from the training rows. Classes
    are scored in log space so that very unlikely values still penalise a
    class instead of underflowing to a probability of zero.
    """

    def __init__(self, var_smoothing: float = 1e-9) -> None:
        """
        Initialize the classifier.

        :param var_smoothing: Fraction of the largest per-feature variance of
            the training data that is added to every class variance when
            scoring, which keeps zero-variance features usable
        :raises ValueError: If var_smoothing is negative
        """
        if var_smoothing < 0:
            raise ValueError("var_smoothing must be non-negative")

        self.var_smoothing = var_smoothing
        self.class_priors: dict[int, float] = {}
        self.means: dict[int, list[float]] = {}
        # Raw per-class variances; the smoothing term is kept separately in
        # ``epsilon`` and only added while scoring.
        self.variances: dict[int, list[float]] = {}
        self.epsilon: float = 0.0

    @staticmethod
    def _mean_and_variance(column: tuple[float, ...]) -> tuple[float, float]:
        """Return the mean and population variance of one feature column."""
        mean = sum(column) / len(column)
        variance = sum((x - mean) ** 2 for x in column) / len(column)
        return mean, variance

    @staticmethod
    def _log_gaussian(x: float, mean: float, variance: float) -> float:
        """Return the natural log of the normal density at x."""
        if variance <= 0.0:
            # Only reachable when smoothing is disabled: treat the feature as
            # a point mass that is neutral on a match and impossible otherwise.
            return 0.0 if x == mean else -math.inf

        return -0.5 * math.log(2.0 * math.pi * variance) - (x - mean) ** 2 / (
            2.0 * variance
        )

    def fit(self, features: list[list[float]], labels: list[int]) -> None:
        """
        Train the Gaussian Naive Bayes classifier.

        Any state learned by a previous call is replaced.

        :param features: Feature matrix
        :param labels: Class labels
        :raises ValueError: If input sizes mismatch, the matrix is empty, or
            rows differ in length

        >>> model = GaussianNaiveBayes()
        >>> model.fit([[1.0], [2.0], [3.0]], [0, 0, 1])
        >>> model.fit([], [])
        Traceback (most recent call last):
            ...
        ValueError: Feature matrix must not be empty
        """
        if len(features) != len(labels):
            raise ValueError("Features and labels must have the same length")

        if not features:
            raise ValueError("Feature matrix must not be empty")

        num_features = len(features[0])
        if any(len(row) != num_features for row in features):
            raise ValueError("All feature rows must have the same length")

        separated: dict[int, list[list[float]]] = {}
        for feature_vector, label in zip(features, labels):
            separated.setdefault(label, []).append(feature_vector)

        total_samples = len(labels)

        # Build into fresh dictionaries so a refit does not keep old classes.
        class_priors: dict[int, float] = {}
        means: dict[int, list[float]] = {}
        variances: dict[int, list[float]] = {}

        for label, rows in separated.items():
            class_priors[label] = len(rows) / total_samples

            statistics = [self._mean_and_variance(col) for col in zip(*rows)]
            means[label] = [mean for mean, _ in statistics]
            variances[label] = [variance for _, variance in statistics]

        # Scale the smoothing term by the spread of the whole training set;
        # fall back to the raw parameter when every feature is constant.
        largest_variance = max(
            (self._mean_and_variance(col)[1] for col in zip(*features)),
            default=0.0,
        )
        if largest_variance > 0.0:
            self.epsilon = self.var_smoothing * largest_variance
        else:
            self.epsilon = self.var_smoothing

        self.class_priors = class_priors
        self.means = means
        self.variances = variances

    def _joint_log_likelihood(self, label: int, row: list[float]) -> float:
        """Return log prior plus the summed per-feature log densities."""
        means = self.means[label]
        variances = self.variances[label]
        if len(row) != len(means):
            raise ValueError(
                f"Expected {len(means)} features per row, got {len(row)}"
            )

        log_likelihood = math.log(self.class_priors[label])
        for value, mean, variance in zip(row, means, variances):
            log_likelihood += self._log_gaussian(
                value, mean, variance + self.epsilon
            )
        return log_likelihood

    def predict(self, features: list[list[float]]) -> list[int]:
        """
        Predict class labels for input features.

        When several classes share the best score, the one that appeared
        first in the training labels is returned.

        :param features: Feature matrix
        :return: Predicted labels
        :raises ValueError: If the model has not been fitted or a row does
            not have the number of features seen during fit

        >>> model = GaussianNaiveBayes()
        >>> X = [[1.0], [2.0], [3.0], [4.0]]
        >>> y = [0, 0, 1, 1]
        >>> model.fit(X, y)
        >>> model.predict([[1.5], [3.5]])
        [0, 1]
        >>> model.predict([[100.0]])
        [1]
        """
        if not self.class_priors:
            raise ValueError("Model must be fitted before calling predict")

        predictions: list[int] = []

        for row in features:
            scores = {
                label: self._joint_log_likelihood(label, row)
                for label in self.class_priors
            }
            # max() over the dict keeps the first label on ties.
            predictions.append(max(scores, key=scores.__getitem__))

        return predictions