In the past I have used a standard algorithm for error-checking data communicated between devices: the cyclic redundancy check CRC-16, implemented either with a 256-entry lookup table or with slower, more complex bit-by-bit calculations. Recently I read about a shorter, quicker algorithm based on two 16-word tables, one indexed by the low nibble and one by the high nibble of each byte.

The C version is for x86 and ARM:

    /*  crc16 - crc16 routine
     *  Thanks to R.K. Irvine, William James Hunt, Rex, & Binstock
     *  This routine returns the crc16 for buf.
     *  crc16 is given by:  x^16 + x^15 + x^2 + 1
     *
     *  The CRC is computed low-bit-first, one byte per iteration, using two
     *  16-entry tables instead of a single 256-entry table:
     *    crc16l[] is indexed by the low  nibble of (byte ^ crc)
     *    crc16h[] is indexed by the high nibble of (byte ^ crc)
     *  XOR-ing the two entries gives the same value a 256-entry table would.
     *
     *  buf    - data to checksum
     *  bnobs  - number of bytes in buf; values <= 0 process nothing
     *  crc    - starting CRC (pass 0 for a new message, or a previous result
     *           to continue a running CRC over several buffers)
     *  returns the updated CRC
     */
    static const uint16_t crc16l[] = {
        0x0000, 0xc0c1, 0xc181, 0x0140, 0xc301, 0x03c0, 0x0280, 0xc241,
        0xc601, 0x06c0, 0x0780, 0xc741, 0x0500, 0xc5c1, 0xc481, 0x0440
    };

    static const uint16_t crc16h[] = {
        0x0000, 0xcc01, 0xd801, 0x1400, 0xf001, 0x3c00, 0x2800, 0xe401,
        0xa001, 0x6c00, 0x7800, 0xb401, 0x5000, 0x9c01, 0x8801, 0x4400,
    };

    uint16_t crc16(char *buf, int bnobs, uint16_t crc)
    {
        uint8_t n;

        while(bnobs-- > 0)
        {
            /* Only the low 8 bits matter: n = data byte ^ low byte of crc. */
            n = (uint8_t)(*buf++ ^ crc);
            crc = (crc >> 8) ^ crc16l[n & 0x0f] ^ crc16h[(n >> 4) & 0x0f];
        }
        return crc;
    }

x86 assembler version:

    ; =============== S U B R O U T I N E =======================================
    ; i33crc - fold one byte into a running CRC-16 (x^16 + x^15 + x^2 + 1)
    ; Syntax:       NASM, 16-bit, 8086-compatible (single-bit shifts only)
    ; Call with:    AL = char (AH is ignored)
    ;               DX = current CRC
    ; Returns:      AX = DX = updated CRC
    ; Preserves:    BX
    ; Clobbers:     flags
    ; Uses the word tables crc16l / crc16h, read through a CS: override.
    ; ===========================================================================
    i33crc:                                 ;crc16()
            push bx                         ;  preserve caller's BX
            mov bl, al
            xor bl, dl                      ;  n = char ^ (crc & 0ffh)
            push bx                         ;  keep n for the high-nibble lookup
            and bl, 0fh
            xor bh, bh
            shl bx, 1                       ;  word index
            mov ax, [cs:crc16l+bx]          ;  crc16l[n&0fh]
            xchg dl, dh                     ;  crc >> 8 ...
            xor dh, dh                      ;  ... with the high byte cleared
            xor dx, ax                      ;  crc = (crc >> 8) ^ crc16l[n&0fh]
            pop bx                          ;  bl = n again
            shr bl, 1                       ;  four single shifts: n >> 4
            shr bl, 1                       ;  (8086 has no shift-by-immediate;
            shr bl, 1                       ;   the shifts also leave the upper
            shr bl, 1                       ;   nibble of bl zero)
            xor bh, bh
            shl bx, 1                       ;  word index
            mov ax, [cs:crc16h+bx]          ;  crc16h[(n>>4) & 0fh]
            xor dx, ax                      ;  crc ^= crc16h[(n>>4) & 0fh]
            mov ax, dx                      ;  return the CRC in AX as well
            pop bx                          ;  restore caller's BX
            ret
    ; =============== S U B R O U T I N E =======================================
    ; i33crct - CRC-16 of a whole buffer, starting from CRC = 0
    ; Call with:    DS:SI = buffer to CRC check
    ;               CX = length of buffer (0 is allowed and returns AX = 0)
    ; Returns:      AX = CRC of the buffer
    ;               SI = advanced past the bytes processed
    ;               CX = 0
    ; Preserves:    BX, DX
    ; Clobbers:     flags (direction flag is left cleared)
    ; ===========================================================================
    i33crct:                                ;crc16t()
            push bx
            push dx
            cld                             ; auto inc
            xor ax, ax                      ; n =0;
            xor dx, dx                      ; crc = 0
            jcxz .done                      ; empty buffer: LOOP would wrap to 65536
    .whrxn:                                 ; while rxnobs > 0
            lodsb                           ;  n = *bfr++
            call i33crc                     ;  DX (and AX) = updated crc
            loop .whrxn
    .done:
            pop dx
            pop bx
            ret                             ; must not fall into the tables below

    ; Low-nibble table: crc16l[n & 0fh]
    crc16l:
            dw 00000h, 0c0c1h, 0c181h, 00140h, 0c301h, 003c0h, 00280h, 0c241h
            dw 0c601h, 006c0h, 00780h, 0c741h, 00500h, 0c5c1h, 0c481h, 00440h
    ; High-nibble table: crc16h[(n >> 4) & 0fh]
    crc16h:
            dw 00000h, 0cc01h, 0d801h, 01400h, 0f001h, 03c00h, 02800h, 0e401h
            dw 0a001h, 06c00h, 07800h, 0b401h, 05000h, 09c01h, 08801h, 04400h

Z80 assembler version

    ;                                   CRC16t
    ; crc16 => x^16 + x^15 + x^2 + 1
    ;
    ; Computes the CRC-16 of a fixed 1024-byte block, starting from crc = 0.
    ; In:       (crcaddr)  = address of the block
    ; Out:      (crccalcd) = calculated CRC, also left in DE
    ; Clobbers: AF, BC, HL, IX, IY
    ;
    ; crctl is indexed by the low nibble and crcth by the high nibble of
    ; n = data byte ^ low byte of crc; entries are 16-bit, stored low byte first.
    crctl:      defw 0000h, 0C0C1h, 0C181h, 0140h, 0C301h, 03C0h, 0280h, 0C241h
                defw 0C601h, 06C0h, 0780h, 0C741h, 0500h, 0C5C1h, 0C481h, 0440h
    crcth:      defw 0000h, 0CC01h, 0D801h, 1400h, 0F001h, 3C00h, 2800h, 0E401h
                defw 0A001h, 6C00h, 7800h, 0B401h, 5000h, 9C01h, 8801h, 4400h
    crc16t:     ld bc, 400h                         ;count = 1024
                ld de, 0                            ;crc = 0
                ld hl, (crcaddr)                    ;hl -> buf
    repeatcrc:  push bc                             ;repeat (save remaining count)
                ld ix, crctl                        ; ix -> crc table low
                ld iy, crcth                        ; iy -> crc table high
                ld a, (hl)                          ; n = *buf++
                inc hl
                xor e                               ; n ^= crc (low byte)
                ld c, a
                ld b, 0                             ; bc = n
                push bc                             ; keep n for the high nibble
                and 0fh                             ; crc16l[n & 0fh]
                add a, a                            ;//times 2 16 bit pointer
                ld c, a
                add ix, bc                          ; ix -> crc16l[n & 0fh]
                pop bc                              ; c = n, b = 0
                ld a, c                             ; crc16h[(n >> 4) & 0fh]
                rra                                 ; three rotates right put
                rra                                 ; n bits 7..4 into bits 4..1;
                rra                                 ; carry junk lands in bits 7..5
                and 1eh                             ; = ((n >> 4) & 0fh) * 2
                ld c, a
                add iy, bc                          ; iy -> crc16h[(n >> 4) & 0fh]
                ld a, (ix+0)                        ; crc  (low byte) =
                xor (iy+0)                          ;        crc16l ^ crc16h ^
                xor d                               ;        crc >> 8
                ld e, a
                ld a, (ix+1)                        ; crc  (high byte) =
                xor (iy+1)                          ;        crc16l ^ crc16h
                ld d, a                             ;        (crc >> 8 has a zero high byte)
                pop bc                              ; restore remaining count
                dec bc
                ld a, b
                or c
                jp nz, repeatcrc                    ;until block count = 0
                ld (crccalcd), de
                ret

The cost of 64 bytes (ROM) for the tables is worth the code simplicity and speed.