------------------------------------------------------------------------
-- 16x16->32-bit unsigned sequential radix-4 multiplier
--   2 bits at a time calculation; almost 100% RTL style
-- Clock, start & stop signals added to use with
--   the adapted GCD testbench
-- A bug fixed: logical shift of 'c' replaced with arithmetic shift
-- L(R)V - 2025
------------------------------------------------------------------------

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;

-- Port-level view of the sequential 16x16 -> 32-bit unsigned multiplier.
entity multiplier is
  port (
    xi  : in  unsigned(15 downto 0);   -- operand captured into the A register
    yi  : in  unsigned(15 downto 0);   -- operand captured into the B register
    rst : in  std_logic;               -- '0' while idle starts a multiplication
    xo  : out unsigned(31 downto 0);   -- registered 32-bit result
    rdy : out std_logic;               -- '1' in the state that stores the result
    clk : in  std_logic                -- all registers update when clk becomes '1'
  );
end entity multiplier;

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;

-- Sequential radix-4 implementation: xo = xi * yi (unsigned).
--
-- Two multiplier bits are consumed per clock. Each 2-bit group plus the
-- carry 'curr_t' from the previous group is recoded to a digit in
-- {-1, 0, +1, +2}; the group value 3 is handled as 4 - 1 and the value 4
-- as 4 + 0, both of which hand a carry to the next group.
--
-- Timing: s0 loads the operands (when rst = '0'), s1..s8 perform the
-- eight steps, s9 asserts 'rdy' and stores the product in 'xo'.
--
-- Width notes:
--   * The upper accumulator is kept as a signed 17-bit value and the adder
--     is 19 bits wide, so neither +2*xi nor -xi can overflow and the sign
--     used for the arithmetic shift is always the true sign.
--   * A carry that is still pending after the last group represents
--     +xi * 2**16. It is added to the upper half while the product is being
--     stored, so it costs no extra clock cycle.
architecture rtl_4_a of multiplier is
  type state is (s0, s1, s2, s3, s4, s5, s6, s7, s8, s9);
  signal curr_state, next_state: state := s0;
  type to_do is (c_keep, c_add, c_add2, c_sub);
  signal oper: to_do;
  signal load_inp, load_out, mul_step: std_logic;
  signal a_bf, b_bf: unsigned(15 downto 0);
  signal c_hi: signed(16 downto 0);      -- upper accumulator (signed)
  signal c_lo: unsigned(15 downto 0);    -- lower product bits, filled from the top
  signal add_out: signed(18 downto 0);   -- adder/subtracter result
  signal hi_fix: unsigned(15 downto 0);  -- upper half with the pending carry applied
  signal curr_t, next_t: std_logic;
begin

  ---------- data path ----------
  -- reg. A (multiplicand)
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then    a_bf <= xi;    end if;
  end process;

  -- reg. B (multiplier, shifted right by two bits per step)
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then     b_bf <= yi;
    elsif  mul_step='1'  then
      b_bf <= "00" & b_bf(15 downto 2);
    end if;
  end process;

  -- reg. C (accumulator): arithmetic shift right by two of {add_out, c_lo}
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then
      c_hi <= (others=>'0');
      c_lo <= (others=>'0');
    elsif  mul_step='1'  then
      -- dropping the two LSBs of the 19-bit sum keeps its sign in c_hi(16)
      c_hi <= add_out(18 downto 2);
      c_lo <= unsigned(add_out(1 downto 0)) & c_lo(15 downto 2);
    end if;
  end process;

  -- reg. T (carry between 2-bit groups)
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then       curr_t <= '0';
    elsif  mul_step='1'  then    curr_t <= next_t;
    end if;
  end process;

  -- final correction: a carry left after the last group is worth xi * 2**16.
  -- The true product is below 2**32, so the corrected upper half always
  -- fits in 16 bits and modulo-2**16 addition gives the exact value.
  process (c_hi, a_bf, curr_t) begin
    if  curr_t='1'  then
      hi_fix <= unsigned(c_hi(15 downto 0)) + a_bf;
    else
      hi_fix <= unsigned(c_hi(15 downto 0));
    end if;
  end process;

  -- reg. C (output)
  process begin
    wait on clk until clk='1';
    if  load_out='1'  then    xo <= hi_fix & c_lo;    end if;
  end process;

  -- conditions: recode (carry, two multiplier bits) into an operation
  process (b_bf, curr_t)
    variable b_bits: std_logic_vector(2 downto 0);
  begin
    b_bits := curr_t & std_logic_vector(b_bf(1 downto 0));
    case b_bits is
    when "000" =>  next_t <= '0';  oper <= c_keep;   -- 0
    when "001" =>  next_t <= '0';  oper <= c_add;    -- 1
    when "010" =>  next_t <= '0';  oper <= c_add2;   -- 2
    when "011" =>  next_t <= '1';  oper <= c_sub;    -- 3 = 4 - 1
    when "100" =>  next_t <= '0';  oper <= c_add;    -- 0 + carry = 1
    when "101" =>  next_t <= '0';  oper <= c_add2;   -- 1 + carry = 2
    when "110" =>  next_t <= '1';  oper <= c_sub;    -- 2 + carry = 3 = 4 - 1
    when others => next_t <= '1';  oper <= c_keep;   -- 3 + carry = 4 = 4 + 0
    end case;
  end process;

  -- adder/subtracter & multiplexers (19-bit signed arithmetic)
  process (oper, a_bf, c_hi)
    variable acc, a_x1, a_x2: signed(18 downto 0);
  begin
    -- sign-extend the accumulator from 17 to 19 bits
    acc := (others => c_hi(16));
    acc(16 downto 0) := c_hi;
    -- zero-extend xi and 2*xi; both are non-negative in 19 bits
    a_x1 := (others => '0');
    a_x1(15 downto 0) := signed(a_bf);
    a_x2 := (others => '0');
    a_x2(16 downto 1) := signed(a_bf);
    case oper is
    when c_keep =>  add_out <= acc;
    when c_add =>   add_out <= acc + a_x1;
    when c_add2 =>  add_out <= acc + a_x2;
    when c_sub =>   add_out <= acc - a_x1;
    end case;
  end process;

  ---------- controller (FSM) ----------
  -- next state & output functions
  process (curr_state, rst) begin
    -- defaults: stay in the current state with every control output
    -- inactive, so no signal of this process is left unassigned on any path
    next_state <= curr_state;
    load_inp <= '0';    mul_step <= '0';    load_out <= '0';    rdy <= '0';
    case curr_state is
      when s0 =>  if rst='0' then
          next_state <= s1;    load_inp <= '1';
        end if;
      when s1 =>  next_state <= s2;    mul_step <= '1';
      when s2 =>  next_state <= s3;    mul_step <= '1';
      when s3 =>  next_state <= s4;    mul_step <= '1';
      when s4 =>  next_state <= s5;    mul_step <= '1';
      when s5 =>  next_state <= s6;    mul_step <= '1';
      when s6 =>  next_state <= s7;    mul_step <= '1';
      when s7 =>  next_state <= s8;    mul_step <= '1';
      when s8 =>  next_state <= s9;    mul_step <= '1';
      when s9 =>  next_state <= s0;    load_out <= '1';    rdy <= '1';
    end case;
  end process;
  -- state register
  process begin
    wait on clk until clk='1';
    curr_state <= next_state;
  end process;

end architecture rtl_4_a;
