------------------------------------------------------------------------
-- 8x8->16-bit unsigned sequential radix-4 multiplier
--   2 bits at a time calculation; no start/ready signals
--   slightly buggy algorithm but it still works...
--   pure 100% RTL style, OK for Xilinx ISE (10.1)
-- L(R)V - 2009
------------------------------------------------------------------------

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;

-- Sequential 8x8 -> 16-bit unsigned multiplier (interface only).
-- There is no start/ready handshake: the only control input is the clock.
entity multiplier7 is
  port (
    clk : in  bit;                      -- clock; registers update when clk becomes '1'
    a   : in  unsigned(7 downto 0);     -- first 8-bit unsigned operand
    b   : in  unsigned(7 downto 0);     -- second 8-bit unsigned operand
    c   : out unsigned(15 downto 0)     -- 16-bit unsigned result (intended a*b)
  );
end entity multiplier7;

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;

-- RTL implementation: radix-4 (2 bits of b per clock) shift-and-add
-- multiplier with digit recoding.
--
-- Each step looks at the two low bits of the B register plus a carry T
-- left by the previous digit, and adds 0, +a, +2a or -a to the high part
-- of the accumulator.  Digit value 3 is recoded as (-1 now, +1 carried
-- into the next digit), so only a, 2a and -a are ever needed.
--
-- Accumulator layout (c_bf, 17 bits):
--   c_bf(16 downto 8)  high part, 9-bit two's complement (partial sums
--                      become negative after a subtract step)
--   c_bf(7 downto 0)   low part, receives product bits shifted out of
--                      the high part, 2 per step
--
-- Every step first ADDS the selected multiple of a to the high part and
-- THEN shifts the whole accumulator right by two (arithmetic shift).  The
-- adder is 11 bits wide so that neither +2a on top of a positive partial
-- sum nor -a on top of a negative one can overflow.
--
-- Schedule (one state per clock, repeats forever):
--   s0      load a, b; clear accumulator and carry
--   s1..s4  four add-and-shift steps
--   s5      final correction: B has been shifted out to zero, so the
--           decode table selects +a exactly when the carry T is still
--           pending (and 0 otherwise); the low 8 bits of the adder
--           output are the high byte of the product, which is latched
--           into c together with the low accumulator byte.
architecture rtl of multiplier7 is
  type state is (s0, s1, s2, s3, s4, s5);
  signal curr_state, next_state: state := s0;
  -- operation selected for the adder in the current step
  type to_do is (c_keep, c_add, c_add2, c_sub);
  signal oper: to_do;
  -- controller outputs (register enables)
  signal load_inp, load_out, mul_step: bit;
  -- operand registers
  signal a_bf, b_bf: unsigned(7 downto 0);
  -- second adder operand and adder result, 11-bit two's complement
  signal add_r, add_out: unsigned(10 downto 0);
  -- accumulator: 9-bit signed high part & 8-bit low part
  signal c_bf: unsigned(16 downto 0);
  -- recoding carry (current/next) and adder carry-in ('1' = subtract)
  signal curr_t, next_t, sub_nadd: std_logic;
begin

  ---------- data path ----------
  -- reg. A: multiplicand, held for the whole multiplication
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then    a_bf <= a;    end if;
  end process;

  -- reg. B: multiplier, shifted right by one radix-4 digit per step
  -- (zero fill, so it is all zeros once the four steps are done)
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then     b_bf <= b;
    elsif  mul_step='1'  then
      b_bf <= "00" & b_bf(7 downto 2);
    end if;
  end process;

  -- reg. C: accumulator.  A step stores the 11-bit sum on top of the low
  -- part shifted right by two, i.e. (high + operand) followed by an
  -- arithmetic right shift of the whole accumulator by two.
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then     c_bf <= (others=>'0');
    elsif  mul_step='1'  then
      c_bf <= add_out & c_bf(7 downto 2);
    end if;
  end process;

  -- reg. T: carry from the previous recoded digit
  process begin
    wait on clk until clk='1';
    if  load_inp='1'  then       curr_t <= '0';
    elsif  mul_step='1'  then    curr_t <= next_t;
    end if;
  end process;

  -- reg. C (output): latch the corrected product.  When load_out is
  -- active the adder output is (high part + a) if a carry is pending,
  -- else the high part unchanged; its low 8 bits are the product's high
  -- byte (the true product always fits in 16 bits).
  process begin
    wait on clk until clk='1';
    if  load_out='1'  then
      c <= add_out(7 downto 0) & c_bf(7 downto 0);
    end if;
  end process;

  -- conditions: digit recoding.  b_bits = carry-in & two low bits of B,
  -- i.e. digit value 0..4:
  --   0 -> +0        1 -> +a         2 -> +2a
  --   3 -> -a, carry 1               4 -> +0, carry 1
  process (b_bf, curr_t)
    variable b_bits: std_logic_vector(2 downto 0);
  begin
    b_bits := curr_t & std_logic_vector(b_bf(1 downto 0));
    case b_bits is
    when "000" =>  next_t <= '0';  oper <= c_keep;  sub_nadd <= '0';
    when "001" =>  next_t <= '0';  oper <= c_add;   sub_nadd <= '0';
    when "010" =>  next_t <= '0';  oper <= c_add2;  sub_nadd <= '0';
    when "011" =>  next_t <= '1';  oper <= c_sub;   sub_nadd <= '1';
    when "100" =>  next_t <= '0';  oper <= c_add;   sub_nadd <= '0';
    when "101" =>  next_t <= '0';  oper <= c_add2;  sub_nadd <= '0';
    when "110" =>  next_t <= '1';  oper <= c_sub;   sub_nadd <= '1';
    when others => next_t <= '1';  oper <= c_keep;  sub_nadd <= '0';
    end case;
  end process;

  -- multiplexers: second adder operand, extended to 11 bits.
  -- For subtraction the operand is the one's complement of the
  -- zero-extended a; the missing +1 comes in through sub_nadd.
  process (oper, a_bf) begin
    case oper is
    when c_keep =>  add_r <= (others=>'0');
    when c_add =>   add_r <= "000" & a_bf;
    when c_add2 =>  add_r <= "00" & a_bf & '0';
    when c_sub =>   add_r <= "111" & unsigned(not std_logic_vector(a_bf));
    end case;
  end process;

  -- adder/subtracter: 11-bit two's-complement add with carry-in.
  -- The carry-in is injected by appending '1' to one operand and
  -- sub_nadd to the other, then dropping the LSB of the 12-bit sum.
  process (c_bf, add_r, sub_nadd)
    variable sign_ext: unsigned(1 downto 0);
    variable acc_tmp, add_r_tmp, sum_tmp: unsigned(11 downto 0);
  begin
    -- sign-extend the 9-bit high part of the accumulator to 11 bits
    sign_ext := (others => c_bf(16));
    acc_tmp := sign_ext & c_bf(16 downto 8) & '1';
    add_r_tmp := add_r & sub_nadd;
    sum_tmp := acc_tmp + add_r_tmp;
    add_out <= sum_tmp(11 downto 1);
  end process;

  ---------- controller (FSM) ----------
  -- next state & output functions: a free-running 6-state ring;
  -- exactly one of load_inp / mul_step / load_out is active per state.
  process (curr_state) begin
    load_inp <= '0';    mul_step <= '0';    load_out <= '0';
    case curr_state is
    when s0 =>  next_state <= s1;    load_inp <= '1';
    when s1 =>  next_state <= s2;    mul_step <= '1';
    when s2 =>  next_state <= s3;    mul_step <= '1';
    when s3 =>  next_state <= s4;    mul_step <= '1';
    when s4 =>  next_state <= s5;    mul_step <= '1';
    when s5 =>  next_state <= s0;    load_out <= '1';
    end case;
  end process;
  -- state register
  process begin
    wait on clk until clk='1';
    curr_state <= next_state;
  end process;

end architecture rtl;
