This Python code is a tool for encoding sentences into...

September 15, 2025 at 04:12 AM

#!/usr/bin/env python3
"""
Vector Speech (vspeech) - Sentence to Vector Compression with Encryption
Usage:
    python vspeech.py -1 -k key.txt                    # Encrypt mode
    python vspeech.py -2 1output.txt -k key.txt        # Decrypt mode
"""

import argparse
import os
import sys
import json
import sentencepiece as spm
from cryptography.fernet import Fernet
import base64

def setup_tokenizer():
    """Return a loaded SentencePiece processor, training the model if needed.

    If ``tokenizer.model`` does not exist in the current working directory,
    a BPE model is trained from a small built-in corpus of coordination
    phrases. The corpus is written to a temporary ``temp_corpus.txt`` file
    that is removed again whether or not training succeeds.

    Returns:
        spm.SentencePieceProcessor: processor loaded from ``tokenizer.model``.
    """
    corpus_file = "temp_corpus.txt"

    # Train only when no model is present; an existing model is reused as-is
    # so that previously encrypted token ids keep decoding to the same text.
    if not os.path.exists("tokenizer.model"):
        print("Training SentencePiece model...")

        # Basic coordination phrases corpus
        corpus = [
            "meet at location alpha at fourteen thirty",
            "meet at location beta at fifteen hundred hours",
            "rendezvous at checkpoint charlie at zero nine hundred",
            "gather at point delta at twenty one thirty",
            "assemble at site echo at zero seven hundred hours",
            "convene at area foxtrot at twelve hundred",
            "rally at position golf at sixteen forty five",
            "muster at zone hotel at eighteen thirty",
            "collect at base india at zero six hundred",
            "unite at camp juliet at twenty three hundred",
            "the passphrase is JUMBO",
            "the passphrase is TIGER",
            "the passphrase is EAGLE",
            "code word is ALPHA",
            "code word is BRAVO",
            "code word is CHARLIE",
            "proceed with caution",
            "abort mission immediately",
            "all clear proceed",
            "hold position",
            "move to fallback location",
            "mission accomplished",
            "requesting backup",
            "target acquired",
            "package secured",
        ]

        try:
            # Write one sentence per line; UTF-8 is stated explicitly so the
            # file does not depend on the platform's default encoding.
            with open(corpus_file, 'w', encoding='utf-8') as f:
                f.writelines(sentence + '\n' for sentence in corpus)

            # NOTE(review): vocab_size=500 may be larger than this tiny
            # corpus can support -- confirm that training succeeds.
            spm.SentencePieceTrainer.train(
                input=corpus_file,
                model_prefix='tokenizer',
                vocab_size=500,
                model_type='bpe'
            )
        finally:
            # Always remove the temp corpus, even when training raises.
            if os.path.exists(corpus_file):
                os.remove(corpus_file)

    # Load the trained model
    sp = spm.SentencePieceProcessor()
    sp.load('tokenizer.model')
    return sp

def generate_key():
    """Return a brand-new key produced by ``Fernet.generate_key()``."""
    fresh_key = Fernet.generate_key()
    return fresh_key

def load_key(key_file):
    """Read and return the raw bytes stored in *key_file*.

    Prints a message and exits the process with status 1 when the file
    does not exist.
    """
    try:
        handle = open(key_file, 'rb')
    except FileNotFoundError:
        print(f"Key file {key_file} not found!")
        sys.exit(1)

    # The file opened successfully; read everything and close it.
    with handle:
        key_bytes = handle.read()
    return key_bytes

def save_key(key, filename):
    """Write the encryption key bytes to *filename*.

    The file is created with owner-only permissions (0o600) so that a newly
    generated secret key is not readable by other users. An existing file is
    truncated and overwritten; its current permissions are left unchanged.

    Args:
        key: Key material as bytes.
        filename: Path of the key file to create or overwrite.
    """
    flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
    # O_BINARY only exists on Windows; it prevents newline translation there.
    flags |= getattr(os, 'O_BINARY', 0)
    fd = os.open(filename, flags, 0o600)
    with os.fdopen(fd, 'wb') as f:
        f.write(key)

def encrypt_mode(key_file):
    """Encrypt mode: prompt for a sentence, tokenize it, encrypt the token ids.

    The sentence is read from standard input, converted to a list of
    SentencePiece token ids, serialized as JSON, encrypted with Fernet and
    written to ``1output.txt``. If *key_file* does not exist, a new key is
    generated and saved there first.

    Args:
        key_file: Path of the Fernet key file to load or create.

    Exits with status 1 when no sentence is entered or when the key file
    does not contain a usable Fernet key.
    """
    # Setup tokenizer
    sp = setup_tokenizer()

    # Get user input
    sentence = input("Enter sentence to encode: ").strip()
    if not sentence:
        print("No input provided!")
        sys.exit(1)

    # Tokenize sentence
    tokens = sp.encode_as_ids(sentence)
    print(f"Tokenized to vectors: {tokens}")

    # Pieces outside the trained vocabulary collapse to the unknown id and
    # cannot be recovered on decode, so tell the user instead of silently
    # producing an output that will not round-trip.
    if sp.unk_id() in tokens:
        print(
            "Warning: input contains text unknown to the tokenizer; "
            "the decoded sentence will not match the original exactly.",
            file=sys.stderr,
        )

    # Load or generate encryption key
    if os.path.exists(key_file):
        key = load_key(key_file)
    else:
        print(f"Key file {key_file} not found. Generating new key...")
        key = generate_key()
        save_key(key, key_file)
        print(f"New key saved to {key_file}")

    # A malformed key makes Fernet raise ValueError; report it cleanly
    # rather than crashing with a traceback.
    try:
        fernet = Fernet(key)
    except ValueError as e:
        print(f"Invalid key in {key_file}: {e}")
        sys.exit(1)

    # Encrypt the tokens
    tokens_json = json.dumps(tokens)
    encrypted_tokens = fernet.encrypt(tokens_json.encode())

    # Save encrypted tokens to file
    with open('1output.txt', 'wb') as f:
        f.write(encrypted_tokens)

    print("Encrypted vectors saved to 1output.txt")
    print(f"Original: '{sentence}'")
    print(f"Vectors: {tokens}")

def decrypt_mode(input_file, key_file):
    """Decrypt mode: encrypted file -> token ids -> sentence.

    Reads *input_file*, decrypts it with the Fernet key stored in
    *key_file*, parses the JSON list of token ids, decodes it with the
    SentencePiece tokenizer and writes the sentence to ``2output.txt``.

    Args:
        input_file: Path of the encrypted file (as written by encrypt mode).
        key_file: Path of the Fernet key file.

    Exits with status 1, after printing the reason, when the key or input
    file is missing, the key is malformed, decryption fails, the payload is
    not a list of integers, decoding fails, or the output cannot be written.
    """
    # Imported locally because only this function needs the exception type.
    from cryptography.fernet import InvalidToken

    # Setup tokenizer
    sp = setup_tokenizer()

    # Load encryption key; a malformed key raises ValueError in Fernet().
    key = load_key(key_file)
    try:
        fernet = Fernet(key)
    except ValueError as e:
        print(f"Decryption failed: {e}")
        sys.exit(1)

    # Read the encrypted file
    try:
        with open(input_file, 'rb') as f:
            encrypted_data = f.read()
    except FileNotFoundError:
        print(f"Input file {input_file} not found!")
        sys.exit(1)

    # Decrypt the tokens. InvalidToken typically carries no message, so
    # print an explicit explanation instead of an empty one.
    try:
        decrypted_data = fernet.decrypt(encrypted_data)
    except InvalidToken:
        print("Decryption failed: wrong key or corrupted/tampered input file")
        sys.exit(1)

    # Parse the payload; UnicodeDecodeError and JSONDecodeError are both
    # ValueError subclasses.
    try:
        tokens = json.loads(decrypted_data.decode())
    except ValueError as e:
        print(f"Decryption failed: {e}")
        sys.exit(1)

    # Only a flat list of integer ids is a valid payload (bool is excluded
    # explicitly because it is a subclass of int).
    is_id_list = isinstance(tokens, list) and all(
        isinstance(t, int) and not isinstance(t, bool) for t in tokens
    )
    if not is_id_list:
        print("Decryption failed: payload is not a list of token ids")
        sys.exit(1)
    print(f"Decrypted vectors: {tokens}")

    # Decode tokens back to sentence
    try:
        decoded_sentence = sp.decode_ids(tokens)
    except Exception as e:
        print(f"Decryption failed: could not decode tokens: {e}")
        sys.exit(1)
    print(f"Decoded sentence: '{decoded_sentence}'")

    # Save to output file; UTF-8 so non-ASCII decoder output can be written
    # regardless of the platform's default encoding.
    try:
        with open('2output.txt', 'w', encoding='utf-8') as f:
            f.write(decoded_sentence)
    except OSError as e:
        print(f"Could not write 2output.txt: {e}")
        sys.exit(1)

    print("Decoded sentence saved to 2output.txt")

def main():
    """Parse the command line and dispatch to encrypt or decrypt mode.

    Exactly one of ``-1``/``--encrypt`` and ``-2``/``--decrypt INPUT_FILE``
    must be given, together with the mandatory ``-k``/``--key`` path.
    Prints an error and exits with status 1 otherwise.
    """
    cli = argparse.ArgumentParser(description='Vector Speech - Sentence compression and encryption')
    cli.add_argument('-1', '--encrypt', action='store_true', help='Encrypt mode')
    cli.add_argument('-2', '--decrypt', metavar='INPUT_FILE', help='Decrypt mode with input file')
    cli.add_argument('-k', '--key', required=True, help='Key file path')
    opts = cli.parse_args()

    want_encrypt = bool(opts.encrypt)
    want_decrypt = bool(opts.decrypt)

    # Guard clauses: reject "both" and "neither" before doing any work.
    if want_encrypt and want_decrypt:
        print("Error: Cannot use both encrypt and decrypt modes")
        sys.exit(1)
    if not (want_encrypt or want_decrypt):
        print("Error: Must specify either encrypt (-1) or decrypt (-2) mode")
        sys.exit(1)

    # Exactly one mode is selected from here on.
    if want_encrypt:
        encrypt_mode(opts.key)
        return
    decrypt_mode(opts.decrypt, opts.key)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

This Python code is a tool for encoding sentences into numerical vectors using a tokenizer, encrypting these vectors, and allowing them to be decrypted back into sentences. It uses the SentencePiece library for tokenization (converting sentences into numerical vector representations) and the cryptography.fernet module for encryption and decryption. Here's a detailed breakdown of the code's functionality:

Key Components and What It Does

Sentence Tokenization with SentencePiece:
- The script uses SentencePiece to turn sentences into tokenized vectors (lists of numerical IDs).
- It trains a tokenizer model (tokenizer.model) using a predefined corpus of sentences.
- When tokenizing, the sentence is encoded into a list of numerical tokens.
Encryption:
- The numerical tokens (vectors) obtained from tokenization are serialized into JSON format.
- This serialized token data is encrypted using the Fernet symmetric encryption scheme provided by the cryptography library.
- The encrypted data (ciphertext) is then saved to a file (1output.txt).
Decryption:
- Reads an encrypted file, decrypts it using the specified encryption key, and retrieves the original vector representation.
- The tokenized vector is then converted back to the original sentence using the trained SentencePiece tokenizer.
Encryption Key Handling:
- The encryption key is loaded from a specified file (key.txt).
- If the key file doesn’t exist during encryption, a new key is generated, saved, and then used for encryption.
Command-Line Arguments:
- The script can be run in two modes, as specified by command-line flags:
  - Encryption Mode (-1): Takes user input (a sentence), encodes it into vectors, encrypts it, and saves it to 1output.txt.
  - Decryption Mode (-2): Reads an encrypted file (e.g., 1output.txt), decrypts it, decodes the vectors back to the original sentence, and saves the result in 2output.txt.
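- Both modes require a key file path via -k (e.g., key.txt); in encryption mode a missing key file is created automatically, while decryption mode requires it to already exist.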

Detailed Steps

Setup:
- The setup_tokenizer function ensures a SentencePiece tokenizer model is available. If the tokenizer.model file doesn’t exist, the script trains one using a predefined corpus of example sentences.
Encrypt Mode (encrypt_mode):
- Prompts the user to enter a sentence.
- Uses the SentencePiece tokenizer to convert the input sentence into numerical vectors.
- Encrypts these vectors using a key (loaded from or saved to key.txt).
- Outputs the encrypted data to 1output.txt.
Decrypt Mode (decrypt_mode):
- Loads the encryption key from the key file (key.txt).
- Reads encrypted data from the file (like 1output.txt).
- Decrypts the data to retrieve the numerical vectors.
- Converts the vectors back into a readable sentence using the SentencePiece tokenizer.
- Saves the reconstructed sentence to 2output.txt.
Key Generation and Saving:
- If the encryption key file doesn’t exist, the script creates a new key using Fernet's generate_key() function. This ensures the encryption process can proceed securely.

Usage Instructions

The script is designed to be run via the command line with the following options:

Encrypt a Sentence:
```
python vspeech.py -1 -k key.txt
```
- Prompts you to input a sentence.
- Encrypts it after tokenization and saves the result in 1output.txt.
Decrypt an Encrypted File:
```
python vspeech.py -2 1output.txt -k key.txt
```
- Decrypts 1output.txt using the key from key.txt.
- Reconstructs the original sentence and saves it in 2output.txt.

Example Workflow

Encrypt a Sentence:
- Input: "meet at location alpha at fourteen thirty"
- Tokenized: [1, 21, 14, 33, 42, 19] (example tokens)
- Encrypted and saved as 1output.txt.
Decrypt the File:
- Reads 1output.txt.
- Decrypts and decodes back to: "meet at location alpha at fourteen thirty".

Applications

This script demonstrates:

- Secure storage of sensitive information (like sentences) as encrypted data.
- Compression of text into tokenized vectors for efficient processing.
- A use case for SentencePiece in customizing natural language preprocessing pipelines.

Generate your own explanations

Download our vscode extension

Read other generated explanations

Built by @thebuilderjr