Probability and Statistics are fundamental mathematical tools in cybersecurity for risk analysis, threat assessment, and data-driven decision making.

What are Probability and Statistics?

Probability studies uncertainty and random events, while statistics analyzes data to draw conclusions and identify patterns; both are essential for risk management in cybersecurity.

Fundamental Concepts

Probability

  • Definition: Measure of the likelihood that an event occurs
  • Range: 0 ≤ P(A) ≤ 1
  • Events: Mutually exclusive, independent
  • Application: Risk analysis

Descriptive Statistics

  • Central Tendency Measures: Mean, median, mode
  • Dispersion Measures: Variance, standard deviation
  • Distributions: Normal, binomial, Poisson
  • Application: Security data analysis

Inferential Statistics

  • Hypotheses: Hypothesis testing
  • Confidence Intervals: Parameter estimation
  • Regression: Relationship analysis
  • Application: Threat prediction

Probability Distributions

Normal Distribution

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

def normal_distribution_example():
    """Evaluate the standard normal density on a grid and print two probabilities.

    Prints P(X < 1) and P(-1 < X < 1) for X ~ N(0, 1), each formatted to
    four decimal places.

    Returns:
        A tuple ``(x, y)``: 100 evenly spaced points on [-4, 4] and the
        normal density evaluated at each of those points.
    """
    # Freezing the distribution fixes the mean (loc) and standard deviation
    # (scale) once instead of repeating them on every call.
    standard_normal = stats.norm(loc=0, scale=1)

    grid = np.linspace(-4, 4, 100)
    density = standard_normal.pdf(grid)

    below_one = standard_normal.cdf(1)
    # P(a < X < b) = F(b) - F(a)
    within_one = below_one - standard_normal.cdf(-1)

    print(f"P(X < 1) = {below_one:.4f}")
    print(f"P(-1 < X < 1) = {within_one:.4f}")

    return grid, density


# Usage example
x, y = normal_distribution_example()

Binomial Distribution

def binomial_distribution_example():
    """Print three probabilities for X ~ Binomial(n=100, p=0.1).

    Prints P(X = 10), P(X ≤ 10) and P(X ≥ 10), each formatted to four
    decimal places.

    Returns:
        A tuple ``(n, p)`` with the number of trials and the success
        probability that were used.
    """
    trials = 100
    success_prob = 0.1
    dist = stats.binom(trials, success_prob)

    exactly_10 = dist.pmf(10)
    at_most_10 = dist.cdf(10)
    # X is integer-valued, so P(X ≥ 10) = 1 - P(X ≤ 9).
    at_least_10 = 1 - dist.cdf(9)

    print(f"P(X = 10) = {exactly_10:.4f}")
    print(f"P(X ≤ 10) = {at_most_10:.4f}")
    print(f"P(X ≥ 10) = {at_least_10:.4f}")

    return trials, success_prob


# Usage example
n, p = binomial_distribution_example()

Poisson Distribution

def poisson_distribution_example():
    """Print three probabilities for X ~ Poisson(5).

    Prints P(X = 5), P(X ≤ 5) and P(X > 5), each formatted to four
    decimal places.

    Returns:
        The occurrence rate (lambda) that was used.
    """
    rate = 5
    dist = stats.poisson(rate)

    exactly_5 = dist.pmf(5)
    at_most_5 = dist.cdf(5)
    # Complement of the cumulative probability up to and including 5.
    more_than_5 = 1 - dist.cdf(5)

    print(f"P(X = 5) = {exactly_5:.4f}")
    print(f"P(X ≤ 5) = {at_most_5:.4f}")
    print(f"P(X > 5) = {more_than_5:.4f}")

    return rate


# Usage example
lambda_param = poisson_distribution_example()

Risk Analysis

Risk Assessment

def risk_assessment(probability, impact):
    """Score a risk as probability times impact and label its level.

    Args:
        probability: Occurrence probability of the event.
        impact: Impact of the event (the usage example uses a 0-1 scale).

    Returns:
        A tuple ``(risk_score, risk_level)`` where ``risk_level`` is
        "High" for scores of at least 0.8, "Medium" for scores of at
        least 0.5, and "Low" otherwise.
    """
    score = probability * impact

    # Thresholds are checked from highest to lowest; the first one the
    # score reaches decides the level.
    for floor, level in ((0.8, "High"), (0.5, "Medium")):
        if score >= floor:
            return score, level

    return score, "Low"


# Usage example
prob = 0.7  # Occurrence probability
impact = 0.9  # Impact (0-1)
risk_score, risk_level = risk_assessment(prob, impact)
print(f"Risk: {risk_level} (Score: {risk_score:.2f})")

Vulnerability Analysis

def vulnerability_analysis(vulnerabilities):
    """Summarize the CVSS scores of a collection of vulnerabilities.

    Args:
        vulnerabilities: Iterable of mappings, each providing a numeric
            score under the ``'cvss'`` key.

    Returns:
        A dict with:
            * ``'mean_cvss'``, ``'std_cvss'``, ``'median_cvss'`` - descriptive
              statistics of the scores (``std_cvss`` uses NumPy's default
              ``ddof=0``). All three are NaN when the input is empty.
            * ``'critical'`` (score >= 9.0), ``'high'`` (7.0 <= score < 9.0),
              ``'medium'`` (4.0 <= score < 7.0) and ``'low'`` (score < 4.0) -
              the number of vulnerabilities in each severity band.

    Raises:
        KeyError: If an entry has no ``'cvss'`` key.
    """
    # Extract the scores once; every statistic and count below reuses them.
    scores = np.array([v['cvss'] for v in vulnerabilities], dtype=float)

    if scores.size == 0:
        # np.mean / np.std / np.median on an empty array return NaN but also
        # emit RuntimeWarnings; return the NaNs directly instead.
        mean_cvss = std_cvss = median_cvss = float('nan')
    else:
        mean_cvss = np.mean(scores)
        std_cvss = np.std(scores)
        median_cvss = np.median(scores)

    # Classification by severity
    critical = int(np.count_nonzero(scores >= 9.0))
    high = int(np.count_nonzero((scores >= 7.0) & (scores < 9.0)))
    medium = int(np.count_nonzero((scores >= 4.0) & (scores < 7.0)))
    low = int(np.count_nonzero(scores < 4.0))

    return {
        'mean_cvss': mean_cvss,
        'std_cvss': std_cvss,
        'median_cvss': median_cvss,
        'critical': critical,
        'high': high,
        'medium': medium,
        'low': low,
    }


# Usage example
vulnerabilities = [
    {'name': 'CVE-2023-001', 'cvss': 9.8},
    {'name': 'CVE-2023-002', 'cvss': 7.5},
    {'name': 'CVE-2023-003', 'cvss': 5.2},
    {'name': 'CVE-2023-004', 'cvss': 3.1},
]
analysis = vulnerability_analysis(vulnerabilities)
print(f"Analysis: {analysis}")

Security Data Analysis

Anomaly Detection

def anomaly_detection(data, threshold=2):
    """Flag values whose absolute z-score exceeds ``threshold``.

    Args:
        data: Sequence or array of numeric observations. Lists and tuples
            are accepted; the input is converted to a float array.
        threshold: Absolute z-score above which a value is flagged.

    Returns:
        A tuple ``(anomalies, z_scores)``: a boolean array marking the
        flagged values and the array of absolute z-scores. Both have the
        same shape as the input. When the data has zero spread (all values
        equal) or is empty, every z-score is 0.
    """
    # Coerce to an array so that plain Python lists support the arithmetic below.
    values = np.asarray(data, dtype=float)

    if values.size == 0:
        empty_scores = np.zeros(values.shape, dtype=float)
        return empty_scores > threshold, empty_scores

    mean = values.mean()
    std = values.std()

    if std == 0:
        # No spread: avoid dividing by zero, nothing deviates from the mean.
        z_scores = np.zeros_like(values)
    else:
        z_scores = np.abs((values - mean) / std)

    anomalies = z_scores > threshold

    return anomalies, z_scores


# Usage example
# Note: for normally distributed data roughly 5% of the points have |z| > 2,
# so the count printed below includes ordinary tail values as well as the
# injected anomaly.
data = np.random.normal(100, 15, 1000)  # Normal data
data[50] = 200  # Anomaly
anomalies, z_scores = anomaly_detection(data)
print(f"Anomalies detected: {np.sum(anomalies)}")

Trend Analysis

def trend_analysis(time_series, forecast_steps=10):
    """Fit a linear trend to a time series and extrapolate it forward.

    The observations are regressed against their positions 0, 1, 2, ...
    with ``scipy.stats.linregress``.

    Args:
        time_series: Sequence or array of numeric observations, in order.
        forecast_steps: Number of future positions to extrapolate. Defaults
            to 10.

    Returns:
        A dict with:
            * ``'slope'`` - slope of the fitted line per step.
            * ``'r_squared'`` - square of the correlation coefficient.
            * ``'p_value'`` - p-value reported by ``linregress`` for the slope.
            * ``'future_predictions'`` - array of ``forecast_steps`` values of
              the fitted line at the positions following the series.
    """
    from scipy import stats

    n_points = len(time_series)

    # Linear regression against the sample index
    x = np.arange(n_points)
    fit = stats.linregress(x, time_series)

    # Prediction: continue the index past the end of the observed series
    future_x = np.arange(n_points, n_points + forecast_steps)
    future_y = fit.slope * future_x + fit.intercept

    return {
        'slope': fit.slope,
        'r_squared': fit.rvalue ** 2,
        'p_value': fit.pvalue,
        'future_predictions': future_y,
    }


# Usage example
time_series = np.random.normal(100, 10, 100) + np.linspace(0, 20, 100)
trend = trend_analysis(time_series)
print(f"Trend: {trend['slope']:.4f}")
print(f"R²: {trend['r_squared']:.4f}")

Hypothesis Testing

Student’s t-test

def t_test(sample1, sample2, alpha=0.05):
    """Run a two-sample t-test and decide whether to reject H0.

    Uses ``scipy.stats.ttest_ind`` with its default settings.

    Args:
        sample1: First sample of observations.
        sample2: Second sample of observations.
        alpha: Significance level; H0 is rejected when the p-value is
            strictly below it.

    Returns:
        A dict with ``'t_statistic'``, ``'p_value'`` and ``'decision'``
        (either "Reject H0" or "Do not reject H0").
    """
    from scipy import stats

    outcome = stats.ttest_ind(sample1, sample2)
    verdict = "Reject H0" if outcome.pvalue < alpha else "Do not reject H0"

    return {
        't_statistic': outcome.statistic,
        'p_value': outcome.pvalue,
        'decision': verdict,
    }


# Usage example
sample1 = np.random.normal(100, 15, 50)
sample2 = np.random.normal(105, 15, 50)
result = t_test(sample1, sample2)
print(f"Result: {result}")

Chi-square Test

def chi_square_test(observed, expected):
    """Compare observed frequencies with expected ones via a chi-square test.

    Delegates to ``scipy.stats.chisquare``.

    Args:
        observed: Observed frequencies per category.
        expected: Expected frequencies per category, in the same order.

    Returns:
        A dict with ``'chi2_statistic'`` and ``'p_value'``.
    """
    from scipy import stats

    outcome = stats.chisquare(observed, f_exp=expected)

    return {
        'chi2_statistic': outcome.statistic,
        'p_value': outcome.pvalue,
    }


# Usage example
observed = [10, 15, 20, 25]
expected = [12, 18, 18, 22]
result = chi_square_test(observed, expected)
print(f"Chi-square: {result}")

Machine Learning for Security

Threat Classification

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def threat_classification(features, labels):
    """Train a Random Forest on labelled samples and report held-out results.

    The data is split with ``test_size=0.2``, the classifier uses 100 trees,
    and both the split and the classifier use ``random_state=42``.

    Args:
        features: Feature matrix, one row per sample.
        labels: Class label for each sample.

    Returns:
        A tuple ``(model, report)``: the fitted classifier and the output of
        ``classification_report`` for the held-out portion.
    """
    # Hold out part of the data for evaluation
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Fit the forest on the training portion only
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_features, train_labels)

    # Score the predictions made on the held-out samples
    predictions = forest.predict(test_features)

    return forest, classification_report(test_labels, predictions)

# Usage example
# features = np.random.randn(1000, 10)  # Features
# labels = np.random.randint(0, 3, 1000)  # Labels (0: benign, 1: malware, 2: phishing)
# model, report = threat_classification(features, labels)
# print(report)

Intrusion Detection

def intrusion_detection(data, threshold=0.5, labels=None):
    """Flag observations whose absolute z-score exceeds ``threshold``.

    Args:
        data: Sequence or array of numeric observations. Lists and tuples
            are accepted; the input is converted to a float array.
        threshold: Absolute z-score above which an observation is flagged.
            The default of 0.5 is kept for backward compatibility but is
            very permissive: for normally distributed data well over half
            of the points have |z| > 0.5. Pass a larger value (e.g. 3) for
            practical use.
        labels: Optional ground truth, one truthy/falsy value per
            observation, where truthy marks a real intrusion. When given,
            the true/false positive counts are measured against it.

    Returns:
        A dict with:
            * ``'intrusions_detected'`` - boolean array of flagged observations.
            * ``'true_positives'`` - with ``labels``: flagged observations that
              are real intrusions. Without ``labels``: the total number of
              flagged observations (a placeholder, not a measured value).
            * ``'false_positives'`` - with ``labels``: flagged observations
              that are not real intrusions. Without ``labels``: 0 (a
              placeholder, since nothing can be verified).

    Raises:
        ValueError: If ``labels`` is given and its shape differs from that
            of ``data``.
    """
    # Coerce to an array so that plain Python lists support the arithmetic below.
    values = np.asarray(data, dtype=float)

    # Spread of the data; treated as 0 for empty input.
    std = values.std() if values.size else 0.0

    if std == 0:
        # No spread (or no data): avoid dividing by zero, nothing deviates.
        z_scores = np.zeros_like(values)
    else:
        z_scores = np.abs((values - values.mean()) / std)

    intrusions = z_scores > threshold

    if labels is None:
        # Without ground truth the detections cannot be verified; keep the
        # historical placeholder values.
        true_positives = np.sum(intrusions)
        false_positives = 0
    else:
        truth = np.asarray(labels, dtype=bool)
        if truth.shape != intrusions.shape:
            raise ValueError("labels must have the same shape as data")
        true_positives = np.sum(intrusions & truth)
        false_positives = np.sum(intrusions & ~truth)

    return {
        'intrusions_detected': intrusions,
        'true_positives': true_positives,
        'false_positives': false_positives,
    }


# Usage example
data = np.random.normal(100, 15, 1000)
data[50:60] = np.random.normal(200, 20, 10)  # Intrusions
intrusion_labels = np.zeros(len(data), dtype=bool)
intrusion_labels[50:60] = True  # Ground truth for the injected intrusions
# An explicit threshold is used because the default of 0.5 would flag most
# of the normal observations as well.
result = intrusion_detection(data, threshold=3, labels=intrusion_labels)
print(f"Intrusions detected: {np.sum(result['intrusions_detected'])}")
print(f"True positives: {result['true_positives']}, false positives: {result['false_positives']}")

Performance Analysis

Performance Metrics

def performance_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        A dict with ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1_score'``.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Shared averaging mode for the per-class metrics.
    weighted = {'average': 'weighted'}

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, **weighted),
        'recall': recall_score(y_true, y_pred, **weighted),
        'f1_score': f1_score(y_true, y_pred, **weighted),
    }


# Usage example
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1, 0, 1, 0, 0, 1]
metrics = performance_metrics(y_true, y_pred)
print(f"Metrics: {metrics}")

Related Concepts

  • Number Theory - Complementary mathematical foundations
  • Abstract Algebra - Complementary mathematical structures
  • Cryptanalysis - Analysis that uses probability and statistics
  • Machine Learning - Techniques that use probability and statistics
  • CISO - Role that oversees probability and statistics
  • General Cybersecurity - Discipline that includes probability and statistics
  • Security Breaches - Incidents analyzed with probability and statistics
  • Attack Vectors - Attacks analyzed with probability and statistics
  • Incident Response - Process that includes probability and statistics
  • SIEM - System that uses probability and statistics
  • SOAR - Automation that uses probability and statistics
  • EDR - Tool that uses probability and statistics
  • Firewall - Device that uses probability and statistics
  • VPN - Connection that uses probability and statistics
  • Dashboards - Visualization of probability and statistics metrics
  • Logs - Logs analyzed with probability and statistics

References