#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_heap_caps.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "person_detect_model.h"
/* Log tag passed to the ESP_LOGx macros throughout this file. */
static const char *TAG = "cam_classify";
/* Camera pins (AI-Thinker ESP32-CAM) */
/* NOTE(review): the CAM_PIN_* macros used by camera_init() are not defined
 * in this excerpt - confirm where they come from. */
/* Class names, indexed by the class index selected in classification_task().
 * The pointers themselves are const: the table is only ever read. */
static const char *const CLASS_LABELS[] = {"no_person", "person"};
/* TFLite Micro tensor arena size in bytes (150 KiB). classification_task()
 * requests the buffer from PSRAM first and from internal RAM as a fallback. */
#define TENSOR_ARENA_SIZE (150 * 1024)
/* Arena buffer; NULL until classification_task() allocates it. */
static uint8_t *s_tensor_arena = NULL;
/* ---- Camera initialization ---- */
/*
 * camera_init - populate a camera_config_t and pass it to esp_camera_init().
 *
 * The visible fields request 96x96 grayscale frames (PIXFORMAT_GRAYSCALE,
 * FRAMESIZE_96X96), a 20 MHz XCLK, fb_location = CAMERA_FB_IN_PSRAM and
 * grab_mode = CAMERA_GRAB_WHEN_EMPTY.
 *
 * Declared to return esp_err_t; app_main() compares the result with ESP_OK.
 *
 * NOTE(review): this excerpt shows no braces around the body, no closing of
 * the initializer, no condition around the error log, no return statement,
 * and the final ESP_LOGI call stops before its arguments. Lines appear to be
 * missing - restore them from the original file before building.
 */
static esp_err_t camera_init(void)
camera_config_t config = {
/* Power-down, reset, clock, SCCB and sync pins come from CAM_PIN_* macros. */
/* NOTE(review): no data-pin fields are set in the visible lines - confirm. */
.pin_pwdn = CAM_PIN_PWDN,
.pin_reset = CAM_PIN_RESET,
.pin_xclk = CAM_PIN_XCLK,
.pin_sccb_sda = CAM_PIN_SIOD,
.pin_sccb_scl = CAM_PIN_SIOC,
.pin_vsync = CAM_PIN_VSYNC,
.pin_href = CAM_PIN_HREF,
.pin_pclk = CAM_PIN_PCLK,
/* 20 MHz XCLK; LEDC timer 0 and LEDC channel 0 are selected. */
.xclk_freq_hz = 20000000,
.ledc_timer = LEDC_TIMER_0,
.ledc_channel = LEDC_CHANNEL_0,
/* 96x96 single-channel frames. */
.pixel_format = PIXFORMAT_GRAYSCALE,
.frame_size = FRAMESIZE_96X96,
.fb_location = CAMERA_FB_IN_PSRAM,
.grab_mode = CAMERA_GRAB_WHEN_EMPTY,
/* NOTE(review): the initializer is not terminated before the next statement. */
esp_err_t err = esp_camera_init(&config);
/* NOTE(review): no visible check of err guards this error log. */
ESP_LOGE(TAG, "Camera init failed: 0x%x", err);
/* NOTE(review): s is not used in any visible line. */
sensor_t *s = esp_camera_sensor_get();
/* NOTE(review): the two %d arguments and the closing ");" are missing. */
ESP_LOGI(TAG, "Camera initialized: %dx%d grayscale",
/* ---- Flash LED ---- */
/*
 * flash_led_init - describe FLASH_LED_PIN as an output in a gpio_config_t
 * and drive the pin low.
 *
 * NOTE(review): the body braces and the end of the initializer are missing,
 * and io_conf is never passed to anything in the visible lines - presumably
 * a call that applies it was dropped; confirm against the original file.
 */
static void flash_led_init(void)
gpio_config_t io_conf = {
/* Bit mask with only the FLASH_LED_PIN bit set. */
.pin_bit_mask = (1ULL << FLASH_LED_PIN),
.mode = GPIO_MODE_OUTPUT,
/* Set the LED pin level to 0. */
gpio_set_level(FLASH_LED_PIN, 0);
static void flash_led_pulse(int duration_ms)
gpio_set_level(FLASH_LED_PIN, 1);
vTaskDelay(pdMS_TO_TICKS(duration_ms));
gpio_set_level(FLASH_LED_PIN, 0);
/* ---- Classification task ---- */
/*
 * classification_task - task function that app_main() hands to
 * xTaskCreatePinnedToCore(); captures camera frames, runs the person
 * detection model on them and logs the result.
 *
 * Visible steps:
 *   1. Allocate TENSOR_ARENA_SIZE bytes for the tensor arena, requesting
 *      MALLOC_CAP_SPIRAM first and MALLOC_CAP_INTERNAL as a fallback.
 *   2. Load person_detect_model_tflite, compare its schema version with
 *      TFLITE_SCHEMA_VERSION, register ops and call AllocateTensors().
 *   3. Grab a frame, check its dimensions, copy the pixels into the input
 *      tensor as int8, call Invoke(), dequantize the outputs, keep the
 *      highest score and log it.
 *
 * arg: task parameter; not referenced in any visible line.
 *
 * NOTE(review): this excerpt has no braces around the body, no loop
 * statement, no declarations of frame_count or best_class, and none of the
 * failure branches show how they exit. Lines appear to be missing - restore
 * them from the original file before building.
 */
static void classification_task(void *arg)
/* Allocate tensor arena in PSRAM */
s_tensor_arena = (uint8_t *)heap_caps_malloc(
TENSOR_ARENA_SIZE, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (s_tensor_arena == NULL) {
ESP_LOGE(TAG, "Failed to allocate tensor arena in PSRAM");
/* Fall back to internal RAM */
s_tensor_arena = (uint8_t *)heap_caps_malloc(
TENSOR_ARENA_SIZE, MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
if (s_tensor_arena == NULL) {
/* NOTE(review): neither if-block above is closed in the visible lines, and
 * nothing shows how the task stops when both allocations fail. */
ESP_LOGE(TAG, "Failed to allocate tensor arena");
ESP_LOGW(TAG, "Tensor arena allocated in internal RAM");
ESP_LOGI(TAG, "Tensor arena allocated in PSRAM (%d KB)",
TENSOR_ARENA_SIZE / 1024);
/* Initialize TFLite Micro */
const tflite::Model *model =
tflite::GetModel(person_detect_model_tflite);
/* Refuse a model whose schema version differs from the library's. */
if (model->version() != TFLITE_SCHEMA_VERSION) {
ESP_LOGE(TAG, "Model schema version mismatch: %lu vs %d",
(unsigned long)model->version(), TFLITE_SCHEMA_VERSION);
/* Resolver declared with template argument 8; three ops are registered in
 * the visible lines. NOTE(review): confirm the model needs no other ops. */
static tflite::MicroMutableOpResolver<8> resolver;
resolver.AddFullyConnected();
resolver.AddMean(); /* For GlobalAveragePooling */
resolver.AddDepthwiseConv2D(); /* For MobileNet */
/* The interpreter is static and is given the arena allocated above. */
static tflite::MicroInterpreter interpreter(
model, resolver, s_tensor_arena, TENSOR_ARENA_SIZE);
if (interpreter.AllocateTensors() != kTfLiteOk) {
ESP_LOGE(TAG, "AllocateTensors failed");
/* Tensor handles used for every inference below. */
TfLiteTensor *input = interpreter.input(0);
TfLiteTensor *output = interpreter.output(0);
ESP_LOGI(TAG, "Model loaded. Arena used: %zu / %d bytes",
interpreter.arena_used_bytes(), TENSOR_ARENA_SIZE);
/* NOTE(review): indexes dims->data[0..3] without checking dims->size. */
ESP_LOGI(TAG, "Input: [%d, %d, %d, %d], scale=%.6f, zp=%d",
input->dims->data[0], input->dims->data[1],
input->dims->data[2], input->dims->data[3],
input->params.scale, input->params.zero_point);
/* Classification loop */
/* Running sum of inference times in microseconds, used for the average. */
int64_t total_inference_us = 0;
camera_fb_t *fb = esp_camera_fb_get();
/* NOTE(review): no visible check of fb guards this error path. */
ESP_LOGE(TAG, "Camera capture failed");
vTaskDelay(pdMS_TO_TICKS(100));
/* Verify frame dimensions */
if (fb->width != IMG_WIDTH || fb->height != IMG_HEIGHT) {
/* NOTE(review): the arguments of this log call are missing. */
ESP_LOGE(TAG, "Unexpected frame size: %dx%d",
esp_camera_fb_return(fb);
/* Preprocess: uint8 [0,255] to int8 [-128,127] */
int8_t *input_data = input->data.int8;
/* NOTE(review): the copy is bounded by fb->len, not by the input tensor's
 * size; the dimension check above is the only visible guard. */
for (size_t i = 0; i < fb->len; i++) {
input_data[i] = (int8_t)(fb->buf[i] - 128);
/* Hand the frame buffer back before running inference. */
esp_camera_fb_return(fb);
/* Time the Invoke() call with esp_timer_get_time(). */
int64_t t_start = esp_timer_get_time();
TfLiteStatus status = interpreter.Invoke();
int64_t t_end = esp_timer_get_time();
int64_t inference_us = t_end - t_start;
if (status != kTfLiteOk) {
ESP_LOGE(TAG, "Inference failed");
/* Dequantize output and find best class */
float out_scale = output->params.scale;
int out_zp = output->params.zero_point;
int8_t *out_data = output->data.int8;
/* Starting value that the first score is compared against. */
float best_score = -100.0f;
for (int i = 0; i < NUM_CLASSES; i++) {
/* Dequantize: (quantized value - zero point) * scale. */
float score = (out_data[i] - out_zp) * out_scale;
/* NOTE(review): the body that records the new best score is missing. */
if (score > best_score) {
total_inference_us += inference_us;
/* Log the chosen label, its score and this inference time in ms. */
ESP_LOGI(TAG, "[%d] %s (%.2f), Infer: %lld ms",
frame_count, CLASS_LABELS[best_class], best_score,
(long long)(inference_us / 1000));
/* Flash LED on person detection */
/* Requires class index 1 with a score above 0.7. */
if (best_class == 1 && best_score > 0.7f) {
/* Print FPS every 10 frames */
/* NOTE(review): frame_count % 10 == 0 also holds for 0, which would divide
 * by zero below - confirm frame_count is incremented before this point. */
if (frame_count % 10 == 0) {
/* The inner division runs before the conversion to float. */
float avg_ms = (total_inference_us / frame_count) / 1000.0f;
float fps = 1000.0f / avg_ms;
/* NOTE(review): the arguments of this log call are missing. */
ESP_LOGI(TAG, "Avg inference: %.1f ms (%.1f FPS)",
/* ---- Entry point ---- */
/*
 * app_main - application entry point, declared with C linkage.
 *
 * Logs a start-up banner and the free internal-RAM and PSRAM heap sizes,
 * calls camera_init(), then creates the "classify" task that runs
 * classification_task() via xTaskCreatePinnedToCore().
 *
 * NOTE(review): the body braces are missing, the camera-failure branch is
 * not closed and shows no halt, and the task-creation call is cut off after
 * its second argument. Restore the missing lines from the original file.
 */
extern "C" void app_main(void)
ESP_LOGI(TAG, "ESP32-CAM Image Classifier starting");
/* Report heap headroom before anything large is allocated. */
ESP_LOGI(TAG, "Free internal RAM: %zu bytes",
heap_caps_get_free_size(MALLOC_CAP_INTERNAL));
ESP_LOGI(TAG, "Free PSRAM: %zu bytes",
heap_caps_get_free_size(MALLOC_CAP_SPIRAM));
if (camera_init() != ESP_OK) {
ESP_LOGE(TAG, "Camera initialization failed. Halting.");
/* Start classification on core 1 (core 0 handles Wi-Fi if needed) */
/* NOTE(review): stack size, parameter, priority, handle and core arguments
 * are not visible. */
xTaskCreatePinnedToCore(classification_task, "classify",
/* Comments */