Data tokenization is a security technique that replaces sensitive data with non-sensitive surrogate values (tokens), preserving system functionality while protecting confidential information.

What is Data Tokenization?

Tokenization converts sensitive data (such as credit card numbers or Social Security numbers) into tokens that reveal nothing about the original values while still allowing the data to be referenced securely.

Main Characteristics

Data Protection

  • Irreversible: Tokens cannot be mathematically reversed to the original data; recovery, where supported, goes through a protected token vault
  • No Value: Tokens have no exploitable value on their own if stolen
  • Secure: The original data lives only in a hardened, access-controlled vault (if it is retained at all)
  • Functional: Tokens can stand in for the real values, so downstream systems keep working

Regulatory Compliance

  • PCI DSS: Reduces the scope of the cardholder data environment under the Payment Card Industry Data Security Standard
  • GDPR: Supports pseudonymization obligations under the EU General Data Protection Regulation
  • HIPAA: Helps protect health information regulated by HIPAA
  • SOX: Supports data-protection controls over financial reporting systems under Sarbanes-Oxley

Scalability

  • High Volume: Handles large volumes of records, such as every transaction in a payment flow
  • Performance: Token generation and lookup add minimal latency to processing
  • Distributed: Can run as a distributed service shared by many applications
  • Cloud: Compatible with cloud and hybrid environments

Tokenization Types

Reversible Tokenization

  • Mapping: A bidirectional mapping between original values and tokens, typically kept in a token vault
  • Recovery: The original data can be recovered when needed
  • Use: Cases where authorized access to the original data is required
  • Security: Lower than irreversible tokenization, because the mapping itself must be protected

Irreversible Tokenization

  • Hash: Tokens generated through one-way hash functions (see the sketch after this list)
  • Non-Recoverable: The original data cannot be recovered from the token
  • Use: Cases where the original data never needs to be retrieved (e.g., matching or deduplication)
  • Security: Higher, because there is no mapping back to the original value
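
A minimal sketch of this approach, assuming a secret per-deployment salt and HMAC-SHA256 (the names below are illustrative), is shown here; the resulting token supports matching and deduplication but can never be turned back into the original value:

import hashlib
import hmac

def irreversible_token(value: str, salt: bytes) -> str:
    """Derive a one-way token; the original value cannot be recovered from it."""
    # A keyed hash (HMAC-SHA256) resists simple dictionary attacks on the token
    digest = hmac.new(salt, value.encode('utf-8'), hashlib.sha256).hexdigest()
    return f"tok_{digest[:24]}"

# The same input always yields the same token, so records can still be joined,
# but there is no mapping that leads back to the original value.
salt = b"per-deployment-secret-salt"  # illustrative; keep in a KMS/HSM in practice
print(irreversible_token("123-45-6789", salt))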

Format-Preserving Tokenization

  • FPE: Often built on Format-Preserving Encryption (e.g., NIST FF1)
  • Format: The token keeps the length and character set of the original data
  • Compatibility: Works with existing systems and database schemas without changes
  • Use: Systems that validate or store data in a specific format

Technical Implementation

Simple Tokenization

import secrets
import base64

class SimpleTokenization:
    def __init__(self, master_key):
        # The master key is kept for interface parity; this in-memory example
        # relies on random tokens plus a lookup table instead.
        self.master_key = master_key
        self.token_map = {}
        self.reverse_map = {}
    
    def tokenize(self, sensitive_data):
        """Tokenize sensitive data"""
        # Reuse the existing token if this value was already tokenized
        if sensitive_data in self.reverse_map:
            return self.reverse_map[sensitive_data]
        
        # Generate a unique random token
        token = self.generate_token()
        
        # Store the mapping (in production, use a secure, encrypted database)
        self.token_map[token] = sensitive_data
        self.reverse_map[sensitive_data] = token
        
        return token
    
    def detokenize(self, token):
        """Recover the original data for a token"""
        if token not in self.token_map:
            raise ValueError("Invalid token")
        
        return self.token_map[token]
    
    def generate_token(self):
        """Generate a unique random token"""
        token = base64.urlsafe_b64encode(secrets.token_bytes(16)).decode('utf-8')
        
        # Regenerate in the unlikely event of a collision
        while token in self.token_map:
            token = base64.urlsafe_b64encode(secrets.token_bytes(16)).decode('utf-8')
        
        return token

# Usage example
tokenizer = SimpleTokenization("master_key_123")
credit_card = "4111-1111-1111-1111"
token = tokenizer.tokenize(credit_card)
print(f"Token: {token}")
original = tokenizer.detokenize(token)
print(f"Original: {original}")

Encrypted Tokenization

import base64
import os

from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding

class EncryptedTokenization:
    def __init__(self, master_key):
        # master_key must be 16, 24, or 32 bytes for AES
        self.master_key = master_key
        self.algorithm = algorithms.AES(self.master_key)
    
    def tokenize(self, sensitive_data):
        """Tokenize with encryption"""
        # Generate a random IV for each token
        iv = os.urandom(16)
        
        cipher = Cipher(self.algorithm, modes.CBC(iv))
        encryptor = cipher.encryptor()
        
        # Apply PKCS7 padding so the plaintext fits the AES block size
        padder = padding.PKCS7(128).padder()
        padded_data = padder.update(sensitive_data.encode()) + padder.finalize()
        
        # Encrypt
        encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
        
        # Create the token (IV + ciphertext, base64-encoded)
        token = base64.urlsafe_b64encode(iv + encrypted_data).decode('utf-8')
        
        return token
    
    def detokenize(self, token):
        """Detokenize with decryption"""
        # Decode the token
        token_data = base64.urlsafe_b64decode(token.encode('utf-8'))
        
        # Split off the IV from the ciphertext
        iv = token_data[:16]
        encrypted_data = token_data[16:]
        
        # Decrypt
        cipher = Cipher(self.algorithm, modes.CBC(iv))
        decryptor = cipher.decryptor()
        decrypted_data = decryptor.update(encrypted_data) + decryptor.finalize()
        
        # Remove the PKCS7 padding
        unpadder = padding.PKCS7(128).unpadder()
        unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()
        
        return unpadded_data.decode('utf-8')

# Usage example
encrypted_tokenizer = EncryptedTokenization(os.urandom(32))
credit_card = "4111-1111-1111-1111"
token = encrypted_tokenizer.tokenize(credit_card)
print(f"Encrypted token: {token}")
original = encrypted_tokenizer.detokenize(token)
print(f"Original: {original}")

Format-Preserving Tokenization (FPE)

import hashlib

class FormatPreservingTokenization:
    """Hash-based sketch that preserves the 16-digit card format.

    Note: this is not standardized format-preserving encryption (e.g., NIST FF1);
    production systems should use a vetted FPE implementation.
    """

    def __init__(self, master_key):
        self.master_key = master_key
    
    def tokenize_credit_card(self, credit_card):
        """Tokenize a credit card number while keeping its format"""
        # Remove spaces and dashes
        clean_card = credit_card.replace(' ', '').replace('-', '')
        
        # Verify the format (16 digits)
        if not clean_card.isdigit() or len(clean_card) != 16:
            raise ValueError("Invalid card format")
        
        # Generate a 16-digit token
        token = self.generate_credit_card_token(clean_card)
        
        # Format like a credit card number
        return f"{token[:4]}-{token[4:8]}-{token[8:12]}-{token[12:16]}"
    
    def generate_credit_card_token(self, credit_card):
        """Derive a deterministic 16-digit token from the card number"""
        # Hash the card number together with the master key
        hash_input = f"{credit_card}{self.master_key}".encode()
        hash_output = hashlib.sha256(hash_input).hexdigest()
        
        # Map pairs of hex characters to decimal digits
        token_digits = []
        for i in range(0, len(hash_output), 2):
            digit = int(hash_output[i:i + 2], 16) % 10
            token_digits.append(str(digit))
        
        # Avoid a leading zero so the token still looks like a card number
        if token_digits[0] == '0':
            token_digits[0] = '4'  # Visa-style prefix
        
        return ''.join(token_digits[:16])

# Usage example
fpe_tokenizer = FormatPreservingTokenization("master_key_123")
credit_card = "4111-1111-1111-1111"
token = fpe_tokenizer.tokenize_credit_card(credit_card)
print(f"Token with format: {token}")

Specific Applications

PCI DSS Compliance

import secrets

class PCITokenization:
    def __init__(self, pci_compliant_vault):
        self.vault = pci_compliant_vault
        # Note: PCI DSS forbids storing CVV after authorization; it is tokenized
        # here only for illustration.
        self.sensitive_fields = ['pan', 'cvv', 'expiry_date']
    
    def tokenize_payment_data(self, payment_data):
        """Tokenize payment data for PCI DSS"""
        tokenized_data = {}
        
        for field, value in payment_data.items():
            if field in self.sensitive_fields:
                # Store the sensitive value in the vault and keep only the token
                token = self.vault.store_and_tokenize(value, field_type=field)
                tokenized_data[f"{field}_token"] = token
            else:
                # Non-sensitive fields pass through unchanged
                tokenized_data[field] = value
        
        return tokenized_data
    
    def detokenize_payment_data(self, tokenized_data):
        """Detokenize payment data"""
        original_data = {}
        
        for field, value in tokenized_data.items():
            if field.endswith("_token") and field[:-len("_token")] in self.sensitive_fields:
                # Look the original value up in the vault
                original_field = field[:-len("_token")]
                original_data[original_field] = self.vault.retrieve_from_token(value)
            else:
                original_data[field] = value
        
        return original_data

# Usage example with a stub vault
class PCIVault:
    def store_and_tokenize(self, data, field_type):
        # Stub: a real vault would persist the value in PCI-compliant storage
        token = f"tok_{field_type}_{secrets.token_hex(8)}"
        return token
    
    def retrieve_from_token(self, token):
        # Stub: a real vault would look the original value up by token
        return "original_data"

pci_vault = PCIVault()
pci_tokenizer = PCITokenization(pci_vault)

payment_data = {
    'pan': '4111-1111-1111-1111',
    'cvv': '123',
    'expiry_date': '12/25',
    'cardholder_name': 'John Doe'
}

tokenized = pci_tokenizer.tokenize_payment_data(payment_data)
print(f"Tokenized data: {tokenized}")

PII Tokenization

import hashlib
import re

class PIITokenization:
    def __init__(self, master_key):
        self.master_key = master_key
        # Simple regex patterns for common PII formats
        self.pii_patterns = {
            'ssn': r'\d{3}-\d{2}-\d{4}',
            'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            'phone': r'\d{3}-\d{3}-\d{4}'
        }
    
    def tokenize_pii(self, text):
        """Replace PII found in free text with tokens"""
        tokenized_text = text
        
        for pii_type, pattern in self.pii_patterns.items():
            for match in re.finditer(pattern, text):
                original = match.group()
                token = self.generate_pii_token(original, pii_type)
                tokenized_text = tokenized_text.replace(original, token)
        
        return tokenized_text
    
    def generate_pii_token(self, pii_data, pii_type):
        """Generate a deterministic token for a piece of PII"""
        # Hash the value together with its type and the master key
        hash_input = f"{pii_data}{pii_type}{self.master_key}".encode()
        hash_output = hashlib.sha256(hash_input).hexdigest()
        
        # Create a token in a format that matches the PII type
        if pii_type == 'ssn':
            token = f"XXX-XX-{hash_output[:4]}"
        elif pii_type == 'email':
            token = f"user_{hash_output[:8]}@example.com"
        elif pii_type == 'phone':
            token = f"XXX-XXX-{hash_output[:4]}"
        else:
            token = f"TOKEN_{hash_output[:8]}"
        
        return token

# Usage example
pii_tokenizer = PIITokenization("master_key_123")
text_with_pii = "John Doe's SSN is 123-45-6789 and email is john@example.com"
tokenized_text = pii_tokenizer.tokenize_pii(text_with_pii)
print(f"Tokenized text: {tokenized_text}")

Best Practices

Security

  • Secure Storage: Keep the token vault in a hardened, isolated system
  • Controlled Access: Enforce strict, role-based access to detokenization
  • Audit: Log and review every tokenize and detokenize operation (see the sketch after this list)
  • Encryption: Encrypt vault contents at rest and in transit
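
As a minimal sketch of controlled access and auditing (the role names, logger setup, and vault interface below are illustrative assumptions, not a prescribed design), detokenization can be wrapped so that only authorized callers reach the vault and every request is logged:

import logging

logging.basicConfig(level=logging.INFO)
audit_log = logging.getLogger("tokenization.audit")

class AuditedTokenService:
    """Wraps a token vault with role checks and an audit trail (illustrative)."""

    def __init__(self, vault, allowed_roles=("payments-service",)):
        self.vault = vault                      # any object exposing tokenize()/detokenize()
        self.allowed_roles = set(allowed_roles)

    def detokenize(self, token, caller, role):
        # Controlled access: only approved roles may recover original data
        if role not in self.allowed_roles:
            audit_log.warning("DENIED detokenize token=%s caller=%s role=%s", token, caller, role)
            raise PermissionError("role not authorized for detokenization")

        original = self.vault.detokenize(token)
        # Audit: record who accessed which token, never the recovered value
        audit_log.info("detokenize token=%s caller=%s role=%s", token, caller, role)
        return original

# Usage example, reusing the SimpleTokenization class defined earlier
vault = SimpleTokenization("master_key_123")
card_token = vault.tokenize("4111-1111-1111-1111")
service = AuditedTokenService(vault)
print(service.detokenize(card_token, caller="checkout-api", role="payments-service"))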

Management

  • Policies: Clear policies defining which data is tokenized, where, and by whom
  • Procedures: Documented procedures for token lifecycle and key rotation
  • Monitoring: Continuous monitoring of tokenization services
  • Response: Defined incident response for vault or key compromise

Compliance

  • Standards: Demonstrate compliance with applicable standards such as PCI DSS
  • Regulations: Map tokenization controls to regulations such as GDPR and HIPAA
  • Documentation: Maintain complete documentation of scope and data flows
  • Verification: Verify controls regularly through audits and assessments

Commercial Tools

HashiCorp Vault

import base64

import hvac

class VaultTokenization:
    def __init__(self, vault_url, token):
        self.client = hvac.Client(url=vault_url, token=token)
    
    def tokenize_with_vault(self, data, mount_point="transit"):
        """Tokenize using Vault's transit engine"""
        try:
            # Transit expects the plaintext to be base64-encoded
            plaintext = base64.b64encode(data.encode('utf-8')).decode('utf-8')
            response = self.client.secrets.transit.encrypt_data(
                name='tokenization-key',
                plaintext=plaintext,
                mount_point=mount_point
            )
            return response['data']['ciphertext']
        except Exception as e:
            print(f"Error tokenizing with Vault: {e}")
            return None
    
    def detokenize_with_vault(self, token, mount_point="transit"):
        """Detokenize using Vault's transit engine"""
        try:
            response = self.client.secrets.transit.decrypt_data(
                name='tokenization-key',
                ciphertext=token,
                mount_point=mount_point
            )
            # The decrypted plaintext comes back base64-encoded
            return base64.b64decode(response['data']['plaintext']).decode('utf-8')
        except Exception as e:
            print(f"Error detokenizing with Vault: {e}")
            return None

# Usage example (requires a running Vault with the transit engine enabled)
vault_tokenizer = VaultTokenization('http://localhost:8200', 'my-token')
token = vault_tokenizer.tokenize_with_vault("sensitive_data")
original = vault_tokenizer.detokenize_with_vault(token)
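
The example above assumes the transit engine is enabled and a key named tokenization-key already exists; a minimal setup sketch with hvac (assuming the client token is allowed to manage secrets engines) could look like this:

import hvac
from hvac.exceptions import InvalidRequest

client = hvac.Client(url='http://localhost:8200', token='my-token')

# Enable the transit secrets engine (raises InvalidRequest if already mounted)
try:
    client.sys.enable_secrets_engine(backend_type='transit', path='transit')
except InvalidRequest:
    pass  # engine already enabled

# Create the named encryption key used by the tokenizer
client.secrets.transit.create_key(name='tokenization-key')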

References