WebGPU e Machine Learning no Browser em 2026: IA Rodando Localmente com JavaScript

Olá, HaWkers! Imagine rodar modelos de IA diretamente no seu browser, sem enviar dados para servidores e com performance próxima à de aplicações nativas. Em 2026, isso não é mais ficção científica — é WebGPU.

Vamos explorar como essa tecnologia está transformando o que é possível fazer com JavaScript no browser.

O Que É WebGPU e Por Que Importa

WebGPU: O Sucessor do WebGL

// Evolução das APIs gráficas no browser

/**
 * Timeline of the graphics APIs exposed to web pages, from WebGL 1 to WebGPU.
 * Each entry records the year, the native API family it is based on and, for the
 * WebGL generations, the limitation that the next generation addresses.
 */
const graphicsAPIEvolution = (() => {
  // First generation: rendering only.
  const webgl1 = {
    year: 2011,
    based_on: 'OpenGL ES 2.0',
    purpose: 'Gráficos 3D no browser',
    limitation: 'API antiga, sem compute shaders'
  };

  // Second generation: more rendering features on the same legacy design.
  const webgl2 = {
    year: 2017,
    based_on: 'OpenGL ES 3.0',
    improvements: 'Transform feedback, instancing',
    limitation: 'Ainda baseada em API legacy'
  };

  // Current generation: graphics plus general-purpose GPU compute.
  const webgpu = {
    year: '2023-2026 (adoção massiva)',
    based_on: 'Vulkan, Metal, DirectX 12',
    purpose: 'Gráficos E computação GPU',
    breakthrough: 'Compute shaders nativos = ML possível!'
  };

  return { webgl1, webgl2, webgpu };
})();

/**
 * Summary of the article's argument for why WebGPU matters for machine learning:
 * native compute shaders, faster pipelines than the WebGL workarounds, and direct
 * buffer access.
 */
const webgpuForML = (() => {
  // What compute shaders are and why ML workloads map onto them.
  const computeShaders = {
    what: 'Programas que rodam na GPU para cálculos paralelos',
    why: 'ML = milhões de operações matemáticas paralelas',
    benefit: 'GPU é 10-100x mais rápida que CPU para isso'
  };

  // How compute is expressed in each API and the resulting speed-up.
  const performance = {
    webgl: 'Hacks para simular compute com texturas',
    webgpu: 'Compute pipelines dedicados',
    improvement: '3-10x mais rápido que WebGL para ML'
  };

  // How each API exposes GPU memory.
  const memoryAccess = {
    webgl: 'Limitado, indireto via texturas',
    webgpu: 'Acesso direto a buffers de memória',
    impact: 'Modelos maiores, menos overhead'
  };

  return { computeShaders, performance, memoryAccess };
})();

Suporte em Browsers (Janeiro 2026)

// Status de suporte WebGPU

/**
 * WebGPU availability per browser as presented by this article (January 2026).
 *
 * `since` names the first stable release that shipped WebGPU enabled by default and
 * `platforms` lists where that browser exposes it.
 */
const browserSupport2026 = {
  chrome: {
    status: 'Totalmente suportado',
    since: 'Chrome 113 (2023)',
    platforms: ['Windows', 'macOS', 'Linux', 'ChromeOS', 'Android']
  },

  edge: {
    status: 'Totalmente suportado',
    since: 'Edge 113 (2023)',
    platforms: ['Windows', 'macOS', 'Linux']
  },

  safari: {
    status: 'Suportado',
    // WebGPU became enabled by default with Safari 26; Safari 17 only exposed it
    // as an experimental feature flag, so it must not be counted as shipped support.
    since: 'Safari 26 (2025)',
    platforms: ['macOS Tahoe 26', 'iOS 26', 'iPadOS 26', 'visionOS 26']
  },

  firefox: {
    status: 'Suportado',
    // Firefox shipped WebGPU on Windows first (141); macOS followed in 145.
    since: 'Firefox 141 (2025)',
    platforms: ['Windows', 'macOS (Apple Silicon, Firefox 145+)', 'Linux (em progresso)']
  },

  coverage2026: '~90% dos usuários desktop têm WebGPU'
};

/**
 * Detects whether WebGPU can actually be used in the current environment.
 *
 * Having `navigator.gpu` is not enough: an adapter and a device must also be
 * obtainable, so both are requested as a probe.
 *
 * @returns `true` when a device could be created, `false` in every other case.
 *          The promise never rejects; failures are logged and reported as `false`
 *          so callers can use the result directly to choose a fallback backend.
 */
async function checkWebGPUSupport(): Promise<boolean> {
  // `navigator` itself does not exist in every JavaScript runtime.
  if (typeof navigator === 'undefined' || !navigator.gpu) {
    console.log('WebGPU não suportado neste browser');
    return false;
  }

  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      console.log('Nenhum adaptador GPU disponível');
      return false;
    }

    const device = await adapter.requestDevice();
    console.log('WebGPU disponível!', device.limits);
    // The probe device is never handed to the caller, so release it right away
    // instead of keeping GPU resources alive for the lifetime of the page.
    device.destroy();
    return true;
  } catch (error: unknown) {
    // requestAdapter/requestDevice can reject (blocked GPU, lost device, ...).
    console.log('Falha ao inicializar WebGPU', error);
    return false;
  }
}

Machine Learning com WebGPU na Prática

ONNX Runtime Web: O Padrão da Indústria

// ONNX Runtime Web com WebGPU

import * as ort from 'onnxruntime-web';

// Set the thread count on ONNX Runtime's `env.wasm` settings.
// NOTE(review): this tunes the WASM backend only and does not enable WebGPU by itself;
// the GPU backend is requested through `executionProviders` when the session is created.
ort.env.wasm.numThreads = 4;

/** Creates the ResNet-50 session on the WebGPU execution provider with full graph optimisation. */
const loadResnetSession = () =>
  ort.InferenceSession.create(
    './models/resnet50.onnx',
    {
      executionProviders: ['webgpu'],  // Run on the GPU
      graphOptimizationLevel: 'all'
    }
  );

// Shared by every call: creating the session loads the model, which is far more
// expensive than a single inference and must not be repeated per image.
let resnetSessionPromise: ReturnType<typeof loadResnetSession> | undefined;

/**
 * Classifies one image with ResNet-50 through ONNX Runtime Web.
 *
 * The session is created lazily on the first call and reused afterwards. If loading
 * fails, the cached promise is dropped so that a later call can retry.
 *
 * @param imageData RGBA pixels of the image to classify.
 * @returns Whatever `getTopKPredictions` produces for the five best classes.
 * @throws Error when the model output named `output` is missing.
 */
async function runInferenceWithWebGPU(imageData: ImageData) {
  if (!resnetSessionPromise) {
    const pending = loadResnetSession();
    resnetSessionPromise = pending;
    pending.catch(() => {
      // Forget a failed load; the awaiting caller still sees the rejection below.
      if (resnetSessionPromise === pending) resnetSessionPromise = undefined;
    });
  }
  const session = await resnetSessionPromise;

  // Prepare the input tensor
  const tensor = new ort.Tensor(
    'float32',
    preprocessImage(imageData),
    [1, 3, 224, 224]  // NCHW format
  );

  // Time only the inference itself, not model loading or preprocessing.
  const startTime = performance.now();
  // NOTE(review): 'input'/'output' must match the tensor names of the exported model.
  const results = await session.run({ input: tensor });
  const endTime = performance.now();

  console.log(`Inferência em ${endTime - startTime}ms`);

  const output = results.output;
  if (!output) {
    throw new Error('Saída "output" não encontrada no resultado do modelo');
  }

  return getTopKPredictions(output.data, 5);
}

/**
 * Converts RGBA pixels into the float payload of a 1×3×224×224 (NCHW) tensor.
 *
 * The source is resampled to 224×224 with nearest-neighbour lookup, the alpha channel
 * is dropped, and each colour channel is scaled to [0, 1] and normalised with its own
 * ImageNet mean and standard deviation.
 *
 * @param imageData Source pixels of any non-empty size.
 * @returns A `Float32Array` of length 3 * 224 * 224, laid out plane by plane (R, G, B).
 * @throws RangeError when the image has no pixels.
 */
function preprocessImage(imageData: ImageData): Float32Array {
  const { data, width, height } = imageData;
  if (width <= 0 || height <= 0) {
    throw new RangeError('Imagem vazia: largura e altura devem ser maiores que zero');
  }

  const size = 224;
  const planeSize = size * size;
  // ImageNet statistics are per channel (R, G, B); using the red values for every
  // channel shifts green and blue away from what the model was trained on.
  const mean = [0.485, 0.456, 0.406];
  const std = [0.229, 0.224, 0.225];

  const float32Data = new Float32Array(3 * planeSize);

  for (let h = 0; h < size; h++) {
    // Nearest-neighbour row in the source; identity when the source is already 224 high.
    const srcY = Math.min(height - 1, Math.floor((h * height) / size));
    for (let w = 0; w < size; w++) {
      const srcX = Math.min(width - 1, Math.floor((w * width) / size));
      const srcBase = (srcY * width + srcX) * 4;  // 4 bytes per RGBA pixel
      const dstBase = h * size + w;
      for (let c = 0; c < 3; c++) {
        float32Data[c * planeSize + dstBase] = (data[srcBase + c] / 255.0 - mean[c]) / std[c];
      }
    }
  }

  return float32Data;
}

Transformers.js: Hugging Face no Browser

// Transformers.js com WebGPU (2026)

import { pipeline, env } from '@xenova/transformers';

// Configure the ONNX `wasm` backend settings: proxy off, four threads.
// NOTE(review): these flags do not select WebGPU by themselves; the snippets in this
// section request the GPU per pipeline through `{ device: 'webgpu' }`. That option looks
// like a Transformers.js v3 feature (published as `@huggingface/transformers`), so confirm
// that the imported package version supports it.
env.backends.onnx.wasm.proxy = false;
env.backends.onnx.wasm.numThreads = 4;

/** Builds the sentiment-analysis pipeline (DistilBERT fine-tuned on SST-2) on WebGPU. */
const loadSentimentClassifier = () =>
  pipeline(
    'sentiment-analysis',
    'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
    { device: 'webgpu' }  // Run on the GPU
  );

// Created on first use and shared afterwards, so the model is not reloaded per call.
let sentimentClassifierPromise: ReturnType<typeof loadSentimentClassifier> | undefined;

/**
 * Classifies the sentiment of `text`.
 *
 * The pipeline is loaded lazily once and reused by later calls. A failed load is not
 * cached, so the next call retries.
 *
 * @param text Text to classify.
 * @returns The pipeline output, e.g. `[{ label: 'POSITIVE', score: 0.9998 }]`
 *          (shape as produced by the library; verify against the installed version).
 */
async function classifyText(text: string) {
  if (!sentimentClassifierPromise) {
    const pending = loadSentimentClassifier();
    sentimentClassifierPromise = pending;
    pending.catch(() => {
      if (sentimentClassifierPromise === pending) sentimentClassifierPromise = undefined;
    });
  }

  const classifier = await sentimentClassifierPromise;
  const result = await classifier(text);
  return result;
}

/** Builds the GPT-2 text-generation pipeline on WebGPU. */
const loadTextGenerator = () =>
  pipeline(
    'text-generation',
    'Xenova/gpt2',
    { device: 'webgpu' }
  );

// Created on first use and shared afterwards, so the model is not reloaded per call.
let textGeneratorPromise: ReturnType<typeof loadTextGenerator> | undefined;

/**
 * Continues `prompt` with up to 50 sampled tokens (temperature 0.7).
 *
 * The pipeline is loaded lazily once and reused by later calls. A failed load is not
 * cached, so the next call retries.
 *
 * @param prompt Text to continue.
 * @returns The `generated_text` of the first candidate returned by the pipeline.
 */
async function generateText(prompt: string) {
  if (!textGeneratorPromise) {
    const pending = loadTextGenerator();
    textGeneratorPromise = pending;
    pending.catch(() => {
      if (textGeneratorPromise === pending) textGeneratorPromise = undefined;
    });
  }

  const generator = await textGeneratorPromise;
  const result = await generator(prompt, {
    max_new_tokens: 50,
    temperature: 0.7,
    do_sample: true
  });

  return result[0].generated_text;
}

/** Builds the all-MiniLM-L6-v2 feature-extraction pipeline on WebGPU. */
const loadEmbeddingExtractor = () =>
  pipeline(
    'feature-extraction',
    'Xenova/all-MiniLM-L6-v2',
    { device: 'webgpu' }
  );

// Created on first use and shared afterwards, so the model is not reloaded per call.
let embeddingExtractorPromise: ReturnType<typeof loadEmbeddingExtractor> | undefined;

/**
 * Computes one embedding per input text for semantic search.
 *
 * Requests mean pooling and normalisation from the pipeline. The pipeline is loaded
 * lazily once and reused; a failed load is not cached, so the next call retries.
 *
 * @param texts Texts to embed.
 * @returns The pipeline output converted with `tolist()`.
 */
async function createEmbeddings(texts: string[]) {
  if (!embeddingExtractorPromise) {
    const pending = loadEmbeddingExtractor();
    embeddingExtractorPromise = pending;
    pending.catch(() => {
      if (embeddingExtractorPromise === pending) embeddingExtractorPromise = undefined;
    });
  }

  const extractor = await embeddingExtractorPromise;
  const embeddings = await extractor(texts, {
    pooling: 'mean',
    normalize: true
  });

  return embeddings.tolist();
}

Rodando LLMs Pequenos no Browser

// LLM local com WebLLM

// WebLLM is published by the MLC project as `@mlc-ai/web-llm`.
import { CreateMLCEngine, prebuiltAppConfig } from '@mlc-ai/web-llm';

/**
 * Runs a single chat completion against a quantized model executing locally in the
 * browser through WebLLM, and logs the assistant's answer.
 *
 * The first call downloads the model weights; later page loads reuse the cached copy.
 */
async function chatWithLocalLLM() {
  // `CreateMLCEngine` takes the model id first and the engine options second. WebLLM
  // always runs on WebGPU, so there is no device option to pass.
  // NOTE(review): the id must match a `model_id` in `prebuiltAppConfig.model_list` of the
  // installed WebLLM version — confirm the exact spelling for Phi-2 (~1.5GB quantized).
  const engine = await CreateMLCEngine('Phi-2-q4f16_1', {
    // Keep downloaded weights in IndexedDB instead of the default Cache API storage.
    appConfig: { ...prebuiltAppConfig, useIndexedDBCache: true }
  });

  // Chat completions (OpenAI-compatible API)
  const response = await engine.chat.completions.create({
    messages: [
      { role: 'system', content: 'Você é um assistente útil.' },
      { role: 'user', content: 'Explique WebGPU em uma frase.' }
    ],
    temperature: 0.7,
    max_tokens: 100
  });

  console.log(response.choices[0].message.content);
  // Example output:
  // "WebGPU é uma API moderna que permite acesso
  //  de baixo nível à GPU para gráficos e computação
  //  de alta performance no browser."
}

/**
 * Streams a chat completion for `prompt` from the shared engine.
 *
 * @param prompt  User message to send.
 * @param onToken Optional callback invoked with each non-empty text fragment as it
 *                arrives (for example to append it to the UI). `process.stdout` is a
 *                Node.js API and does not exist in the browser, so the output is
 *                delivered through this callback and the return value instead.
 * @returns The complete response text once the stream has finished.
 */
async function streamResponse(
  prompt: string,
  onToken: (token: string) => void = () => {}
): Promise<string> {
  const engine = await getEngine();  // Reuse the already-loaded engine

  const stream = await engine.chat.completions.create({
    messages: [{ role: 'user', content: prompt }],
    stream: true
  });

  let fullText = '';
  for await (const chunk of stream) {
    // Some chunks carry no text (role-only deltas, final usage chunk).
    const content = chunk.choices[0]?.delta?.content ?? '';
    if (content === '') continue;
    fullText += content;
    onToken(content);
  }

  return fullText;
}

Casos de Uso Práticos

1. Classificação de Imagens em Tempo Real

// Classificador de imagens offline

/**
 * Classifies video frames with a MobileNetV3 ONNX model running on the WebGPU
 * execution provider of ONNX Runtime Web.
 *
 * Call `initialize()` once before `classify()`.
 */
class ImageClassifier {
  private session: ort.InferenceSession | null = null;
  private labels: string[] = [];
  // One 224×224 canvas context reused for every frame; allocating a canvas per
  // classification would create garbage several times per second.
  private frameContext: CanvasRenderingContext2D | null = null;

  /**
   * Loads the model and the class labels.
   * @throws Error when the labels file cannot be fetched.
   */
  async initialize() {
    this.session = await ort.InferenceSession.create(
      './models/mobilenet_v3.onnx',
      { executionProviders: ['webgpu'] }
    );

    const response = await fetch('./models/imagenet_labels.json');
    if (!response.ok) {
      throw new Error(`Falha ao carregar labels: HTTP ${response.status}`);
    }
    this.labels = await response.json();
  }

  /**
   * Classifies the current frame of `video`.
   * @param video Playing video element to sample.
   * @returns The label with the highest score.
   * @throws Error when `initialize()` has not completed or the model returns no scores.
   */
  async classify(video: HTMLVideoElement): Promise<string> {
    if (!this.session) throw new Error('Não inicializado');

    // Draw the frame scaled down to the model's input resolution.
    const ctx = this.getFrameContext();
    ctx.drawImage(video, 0, 0, 224, 224);

    const imageData = ctx.getImageData(0, 0, 224, 224);
    const tensor = this.imageToTensor(imageData);

    // NOTE(review): 'input'/'output' must match the tensor names of the exported model.
    const results = await this.session.run({ input: tensor });
    const probs = results.output.data as Float32Array;
    if (probs.length === 0) throw new Error('Modelo não retornou probabilidades');

    // Single-pass argmax; spreading a typed array into Math.max scans it twice and
    // passes every element as a call argument.
    let maxIdx = 0;
    for (let i = 1; i < probs.length; i++) {
      if (probs[i] > probs[maxIdx]) maxIdx = i;
    }
    return this.labels[maxIdx];
  }

  /** Returns the shared 224×224 2D context, creating it on first use. */
  private getFrameContext(): CanvasRenderingContext2D {
    if (!this.frameContext) {
      const canvas = document.createElement('canvas');
      canvas.width = 224;
      canvas.height = 224;
      // Pixels are read back on every frame, so hint the browser accordingly.
      const ctx = canvas.getContext('2d', { willReadFrequently: true });
      if (!ctx) throw new Error('Canvas 2D não disponível');
      this.frameContext = ctx;
    }
    return this.frameContext;
  }

  /** Wraps the preprocessed pixels of a 224×224 frame in a 1×3×224×224 tensor. */
  private imageToTensor(imageData: ImageData): ort.Tensor {
    // Reuse the file's shared preprocessing so both classifiers normalise identically.
    return new ort.Tensor('float32', preprocessImage(imageData), [1, 3, 224, 224]);
  }
}

/**
 * Starts the webcam, then classifies frames continuously and shows the current label
 * in the `#result` element.
 *
 * Expects a `<video id="webcam">` and an element with id `result` in the page.
 *
 * @throws Error when the `#result` element is missing.
 */
async function startWebcamClassification() {
  const classifier = new ImageClassifier();
  await classifier.initialize();

  const video = document.getElementById('webcam') as HTMLVideoElement;
  const stream = await navigator.mediaDevices.getUserMedia({ video: true });
  video.srcObject = stream;

  const resultElement = document.getElementById('result');
  if (!resultElement) throw new Error('Elemento #result não encontrado');

  // Schedule the next classification only after the previous one has finished.
  // A fixed setInterval with an async callback would start overlapping inferences
  // whenever one takes longer than the interval, and would leave rejections unhandled.
  const classifyNextFrame = async (): Promise<void> => {
    try {
      resultElement.textContent = await classifier.classify(video);
    } catch (error: unknown) {
      console.error('Falha ao classificar frame', error);
    }
    // Wait 100ms between classifications
    setTimeout(() => void classifyNextFrame(), 100);
  };

  void classifyNextFrame();
}

2. Tradução Offline

// Tradução no browser sem servidor

import { pipeline } from '@xenova/transformers';

/**
 * Translates text in the browser with an OPUS-MT model loaded through Transformers.js.
 *
 * Call `initialize()` once for a language pair before `translate()`.
 */
class OfflineTranslator {
  // Kept as `any`: the library types the pipeline result as a broad union of shapes.
  private translator: any = null;

  /**
   * Loads the translation pipeline for one language pair.
   * @param sourceLang Source language code used in the model id (e.g. 'en').
   * @param targetLang Target language code used in the model id (e.g. 'pt').
   */
  async initialize(sourceLang: string, targetLang: string) {
    const modelId = `Xenova/opus-mt-${sourceLang}-${targetLang}`;

    // No cache option is passed: `pipeline()` has no `cache` setting, and in the
    // browser Transformers.js already keeps downloaded model files in the browser cache.
    this.translator = await pipeline(
      'translation',
      modelId,
      { device: 'webgpu' }
    );
  }

  /**
   * Translates `text` with the loaded model.
   * @returns The translated text of the first result.
   * @throws Error when `initialize()` has not completed or the model returns no result.
   */
  async translate(text: string): Promise<string> {
    if (!this.translator) throw new Error('Não inicializado');

    const result = await this.translator(text, {
      max_length: 512
    });

    const first = Array.isArray(result) ? result[0] : undefined;
    if (!first || typeof first.translation_text !== 'string') {
      throw new Error('Tradução não retornou resultado');
    }
    return first.translation_text;
  }
}

// Usage: load the English → Portuguese model once, then translate a sentence.
// NOTE(review): `initialize('en', 'pt')` resolves to the model id `Xenova/opus-mt-en-pt`;
// confirm that a checkpoint with this exact id is published before relying on it.
const translator = new OfflineTranslator();
await translator.initialize('en', 'pt');  // English → Portuguese

const translated = await translator.translate(
  'WebGPU enables machine learning in the browser'
);
console.log(translated);
// Example output (Portuguese): "WebGPU permite aprendizado de máquina no navegador"

3. Busca Semântica Local

// Busca semântica com embeddings locais

import { pipeline } from '@xenova/transformers';

/**
 * In-browser semantic search: documents and queries are embedded with
 * all-MiniLM-L6-v2 and ranked by cosine similarity.
 *
 * Call `initialize()` once, then `indexDocuments()`, then `search()`.
 */
class SemanticSearch {
  // Kept as `any`: the library types the pipeline result as a broad union of shapes.
  private embedder: any = null;
  private documents: string[] = [];
  private embeddings: number[][] = [];

  /** Loads the embedding pipeline on WebGPU. */
  async initialize() {
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2',
      { device: 'webgpu' }
    );
  }

  /**
   * Replaces the index with `docs` and computes their embeddings.
   * @throws Error when `initialize()` has not completed.
   */
  async indexDocuments(docs: string[]) {
    if (!this.embedder) throw new Error('Não inicializado');

    if (docs.length === 0) {
      // Nothing to embed; avoid calling the model with an empty batch.
      this.documents = [];
      this.embeddings = [];
      return;
    }

    // Embed all documents in one batch
    const output = await this.embedder(docs, {
      pooling: 'mean',
      normalize: true
    });

    // Assign both together so the index never pairs new documents with old vectors.
    this.documents = docs;
    this.embeddings = output.tolist();
  }

  /**
   * Returns the `topK` indexed documents most similar to `query`, best first.
   * @param query Free-text query.
   * @param topK  Maximum number of results; values below 1 yield an empty list.
   * @throws Error when `initialize()` has not completed.
   */
  async search(query: string, topK: number = 5): Promise<SearchResult[]> {
    if (!this.embedder) throw new Error('Não inicializado');

    const limit = Math.max(0, Math.floor(topK));
    if (limit === 0 || this.embeddings.length === 0) return [];

    // Embed the query with the same settings as the documents
    const queryOutput = await this.embedder([query], {
      pooling: 'mean',
      normalize: true
    });
    const queryEmbedding: number[] = queryOutput.tolist()[0];

    // Score every document against the query
    const scores = this.embeddings.map((docEmb, idx) => ({
      index: idx,
      score: this.cosineSimilarity(queryEmbedding, docEmb)
    }));

    // Highest similarity first
    scores.sort((a, b) => b.score - a.score);

    return scores.slice(0, limit).map(s => ({
      document: this.documents[s.index],
      score: s.score
    }));
  }

  /**
   * Cosine similarity of two vectors that are already normalised to unit length,
   * which reduces to their dot product.
   * @throws RangeError when the vectors have different lengths.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      throw new RangeError('Embeddings com dimensões diferentes');
    }
    let dot = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
    }
    return dot;
  }
}

/** One ranked hit returned by `SemanticSearch.search`. */
interface SearchResult {
  /** The indexed document text. */
  document: string;
  /** Cosine similarity between the query and the document. */
  score: number;
}

Performance e Limitações

Benchmarks Reais

// Comparação de performance (Janeiro 2026)

/**
 * Benchmark figures quoted by the article (January 2026) for three workloads,
 * comparing the CPU/WASM, WebGL and WebGPU backends where available.
 */
const performanceComparison = (() => {
  // Latency per image; WebGPU is roughly 12x faster than the CPU figure.
  const imageClassification = {
    model: 'MobileNetV3',
    inputSize: '224x224',
    results: {
      cpu_wasm: '150ms',
      webgl: '45ms',
      webgpu: '12ms'
    }
  };

  // Throughput; the WebGPU figure is fast enough for interactive chat.
  const textGeneration = {
    model: 'Phi-2 (2.7B params, quantizado)',
    metric: 'tokens por segundo',
    results: {
      cpu_wasm: '2 tokens/s',
      webgpu: '25 tokens/s'
    }
  };

  // Latency for one batch of 32 texts.
  const embeddings = {
    model: 'all-MiniLM-L6-v2',
    batchSize: 32,
    results: {
      cpu_wasm: '800ms',
      webgpu: '120ms'
    }
  };

  return { imageClassification, textGeneration, embeddings };
})();

Limitações Atuais

// O que ainda não funciona bem

/**
 * Known limitations of running ML in the browser, each with its practical impact
 * and the workaround suggested by the article.
 */
const limitations2026 = (() => {
  // Builds one entry with the keys in their fixed order.
  const limitation = (issue: string, impact: string, workaround: string) => ({
    issue,
    impact,
    workaround
  });

  return {
    modelSize: limitation(
      'Browsers têm limite de memória (~4GB)',
      'LLMs grandes (7B+) não funcionam bem',
      'Use modelos quantizados (q4, q8)'
    ),

    training: limitation(
      'WebGPU é bom para inferência, não treinamento',
      'Fine-tuning pesado não é viável',
      'Treinar server-side, inferir no browser'
    ),

    coldStart: limitation(
      'Primeiro carregamento baixa modelo inteiro',
      'Modelos grandes = espera inicial longa',
      'Cache em IndexedDB, loading progressivo'
    ),

    compatibility: limitation(
      'Nem todos os browsers/devices suportam',
      '~10% dos usuários sem WebGPU',
      'Fallback para WASM/WebGL'
    )
  };
})();

/**
 * Creates an ONNX Runtime session with graceful backend fallback.
 *
 * Execution providers are listed in order of preference: WebGPU (only when the
 * support check succeeds), then WebGL, then WASM on the CPU as the last resort.
 *
 * @param modelPath Location of the ONNX model to load.
 * @returns The promise returned by `ort.InferenceSession.create`.
 */
async function createInferenceSession(modelPath: string) {
  const gpuAvailable = await checkWebGPUSupport();

  const executionProviders = gpuAvailable
    ? ['webgpu', 'webgl', 'wasm']
    : ['webgl', 'wasm'];

  return ort.InferenceSession.create(modelPath, { executionProviders });
}

Conclusão

WebGPU está tornando ML no browser uma realidade prática em 2026:

O que já funciona bem:

Classificação de imagens em tempo real
Embeddings e busca semântica
LLMs pequenos (até ~3B params quantizados)
Tradução e NLP offline
Detecção de objetos

O que ainda é desafiador:

LLMs muito grandes (7B+)
Treinamento de modelos
Usuários com hardware antigo

Benefícios chave:

Privacidade: Dados nunca saem do device
Latência: Sem round-trip para servidor
Offline: Funciona sem internet
Custo: Sem custo de GPU cloud

Se você está construindo aplicações que podem se beneficiar de ML, WebGPU é uma tecnologia que vale explorar agora.

Para entender mais sobre performance no JavaScript moderno, leia: VoidZero 2026: Toolchain Rust para JavaScript.