/* GNU/Linux/AArch64 specific low level interface, for the remote server for
   GDB.

   Copyright (C) 2009-2015 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GDB.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include "server.h"
#include "linux-low.h"
#include "nat/aarch64-linux.h"
#include "nat/aarch64-linux-hw-point.h"
#include "arch/aarch64-insn.h"
#include "linux-aarch32-low.h"
#include "elf/common.h"
#include "ax.h"
#include "tracepoint.h"

#include <signal.h>
#include <sys/user.h>
#include "nat/gdb_ptrace.h"
#include <asm/ptrace.h>
#include <inttypes.h>
#include <endian.h>
#include <sys/uio.h>

#include "gdb_proc_service.h"

/* Defined in auto-generated files.  Both symbols are only declared
   here; tdesc_aarch64 is the target description this file hands out
   for 64-bit ELF executables.  */
void init_registers_aarch64 (void);
extern const struct target_desc *tdesc_aarch64;

#ifdef HAVE_SYS_REG_H
#include <sys/reg.h>
#endif

/* Register numbering used by this file: the 31 X registers occupy
   numbers 0..30, followed by SP, PC and CPSR, then the 32 V registers,
   and finally FPSR and FPCR.  */
#define AARCH64_X_REGS_NUM 31
#define AARCH64_V_REGS_NUM 32
#define AARCH64_X0_REGNO    0
#define AARCH64_SP_REGNO   31
#define AARCH64_PC_REGNO   32
#define AARCH64_CPSR_REGNO 33
#define AARCH64_V0_REGNO   34
#define AARCH64_FPSR_REGNO (AARCH64_V0_REGNO + AARCH64_V_REGS_NUM)
#define AARCH64_FPCR_REGNO (AARCH64_V0_REGNO + AARCH64_V_REGS_NUM + 1)

/* One past the highest register number (FPCR).  */
#define AARCH64_NUM_REGS (AARCH64_V0_REGNO + AARCH64_V_REGS_NUM + 2)

/* Per-process arch-specific data we want to keep.  */

struct arch_process_info
{
  /* Hardware breakpoint/watchpoint data.
     The reason for them to be per-process rather than per-thread is
     the lack of information in the gdbserver environment; gdbserver
     is not told whether a requested hardware breakpoint/watchpoint
     is thread specific or not, so it has to set each hw bp/wp for
     every thread in the current process.  The higher level bp/wp
     management in gdb will resume a thread if a hw bp/wp trap is not
     expected for it.  Since the hw bp/wp setting is the same for each
     thread, it is reasonable for the data to live here.  */
  struct aarch64_debug_reg_state debug_reg_state;
};

/* Tell whether the current thread uses a 64-bit target description.
   The test is whether register 0 of the thread's regcache description
   is 8 bytes wide.  Returns non-zero if so, zero otherwise.  */

static int
is_64bit_tdesc (void)
{
  struct regcache *cache = get_thread_regcache (current_thread, 0);
  int reg0_size = register_size (cache->tdesc, 0);

  return reg0_size == 8;
}

/* Implementation of linux_target_ops method "cannot_store_register".
   Returns 1 for any REGNO at or beyond AARCH64_NUM_REGS, 0 otherwise.  */

static int
aarch64_cannot_store_register (int regno)
{
  if (regno < AARCH64_NUM_REGS)
    return 0;

  return 1;
}

/* Implementation of linux_target_ops method "cannot_fetch_register".
   Returns 1 for any REGNO at or beyond AARCH64_NUM_REGS, 0 otherwise.  */

static int
aarch64_cannot_fetch_register (int regno)
{
  if (regno < AARCH64_NUM_REGS)
    return 0;

  return 1;
}

/* Copy the general registers (X0..X30, SP, PC, CPSR) out of REGCACHE
   into BUF, which is laid out as a struct user_pt_regs.  */

static void
aarch64_fill_gregset (struct regcache *regcache, void *buf)
{
  struct user_pt_regs *gregs = buf;
  int regno;

  /* X registers map one-to-one onto the regs[] array.  */
  for (regno = AARCH64_X0_REGNO;
       regno < AARCH64_X0_REGNO + AARCH64_X_REGS_NUM;
       regno++)
    collect_register (regcache, regno,
                      &gregs->regs[regno - AARCH64_X0_REGNO]);

  collect_register (regcache, AARCH64_SP_REGNO, &gregs->sp);
  collect_register (regcache, AARCH64_PC_REGNO, &gregs->pc);
  collect_register (regcache, AARCH64_CPSR_REGNO, &gregs->pstate);
}

/* Supply REGCACHE with the general registers (X0..X30, SP, PC, CPSR)
   found in BUF, which is laid out as a struct user_pt_regs.  */

static void
aarch64_store_gregset (struct regcache *regcache, const void *buf)
{
  const struct user_pt_regs *gregs = buf;
  int regno;

  /* X registers map one-to-one onto the regs[] array.  */
  for (regno = AARCH64_X0_REGNO;
       regno < AARCH64_X0_REGNO + AARCH64_X_REGS_NUM;
       regno++)
    supply_register (regcache, regno,
                     &gregs->regs[regno - AARCH64_X0_REGNO]);

  supply_register (regcache, AARCH64_SP_REGNO, &gregs->sp);
  supply_register (regcache, AARCH64_PC_REGNO, &gregs->pc);
  supply_register (regcache, AARCH64_CPSR_REGNO, &gregs->pstate);
}

/* Copy the FP/SIMD registers (V0..V31, FPSR, FPCR) out of REGCACHE
   into BUF, which is laid out as a struct user_fpsimd_state.  */

static void
aarch64_fill_fpregset (struct regcache *regcache, void *buf)
{
  struct user_fpsimd_state *fpregs = buf;
  int regno;

  /* V registers map one-to-one onto the vregs[] array.  */
  for (regno = AARCH64_V0_REGNO;
       regno < AARCH64_V0_REGNO + AARCH64_V_REGS_NUM;
       regno++)
    collect_register (regcache, regno,
                      &fpregs->vregs[regno - AARCH64_V0_REGNO]);

  collect_register (regcache, AARCH64_FPSR_REGNO, &fpregs->fpsr);
  collect_register (regcache, AARCH64_FPCR_REGNO, &fpregs->fpcr);
}

/* Supply REGCACHE with the FP/SIMD registers (V0..V31, FPSR, FPCR)
   found in BUF, which is laid out as a struct user_fpsimd_state.  */

static void
aarch64_store_fpregset (struct regcache *regcache, const void *buf)
{
  const struct user_fpsimd_state *fpregs = buf;
  int regno;

  /* V registers map one-to-one onto the vregs[] array.  */
  for (regno = AARCH64_V0_REGNO;
       regno < AARCH64_V0_REGNO + AARCH64_V_REGS_NUM;
       regno++)
    supply_register (regcache, regno,
                     &fpregs->vregs[regno - AARCH64_V0_REGNO]);

  supply_register (regcache, AARCH64_FPSR_REGNO, &fpregs->fpsr);
  supply_register (regcache, AARCH64_FPCR_REGNO, &fpregs->fpcr);
}

/* Enable miscellaneous debugging output.  The name is historical - it
   was originally used to debug LinuxThreads support.  Only declared
   here; this file reads it to decide whether to print the stop PC.  */
extern int debug_threads;

/* Implementation of linux_target_ops method "get_pc".

   Read the "pc" register from REGCACHE and return it.  The value is
   fetched as an unsigned long when register 0 is 8 bytes wide and as
   an unsigned int otherwise.  When debug_threads is set the value is
   also printed.  */

static CORE_ADDR
aarch64_get_pc (struct regcache *regcache)
{
  const int is_64bit = register_size (regcache->tdesc, 0) == 8;

  if (!is_64bit)
    {
      unsigned int pc32;

      collect_register_by_name (regcache, "pc", &pc32);
      if (debug_threads)
        debug_printf ("stop pc is %04x\n", pc32);
      return pc32;
    }
  else
    {
      unsigned long pc64;

      collect_register_by_name (regcache, "pc", &pc64);
      if (debug_threads)
        debug_printf ("stop pc is %08lx\n", pc64);
      return pc64;
    }
}

/* Implementation of linux_target_ops method "set_pc".

   Store PC into the "pc" register of REGCACHE.  The value is supplied
   as an unsigned long when register 0 is 8 bytes wide and as an
   unsigned int otherwise.  */

static void
aarch64_set_pc (struct regcache *regcache, CORE_ADDR pc)
{
  unsigned long pc64;
  unsigned int pc32;

  switch (register_size (regcache->tdesc, 0))
    {
    case 8:
      pc64 = pc;
      supply_register_by_name (regcache, "pc", &pc64);
      break;

    default:
      pc32 = pc;
      supply_register_by_name (regcache, "pc", &pc32);
      break;
    }
}

/* Size in bytes of the software breakpoint instruction below.  */
#define aarch64_breakpoint_len 4

/* AArch64 BRK software debug mode instruction.
   This instruction needs to match gdb/aarch64-tdep.c
   (aarch64_default_breakpoint).  The bytes are listed in the order
   they are compared against inferior memory.  */
static const gdb_byte aarch64_breakpoint[] = {0x00, 0x00, 0x20, 0xd4};

/* Implementation of linux_target_ops method "breakpoint_at".

   Return 1 if the aarch64_breakpoint_len bytes at WHERE in the
   inferior equal the software breakpoint instruction, 0 otherwise.
   An unreadable address is reported as "no breakpoint".  */

static int
aarch64_breakpoint_at (CORE_ADDR where)
{
  gdb_byte insn[aarch64_breakpoint_len];

  /* read_memory returns non-zero on failure, in which case INSN is
     left uninitialized and must not be compared.  */
  if ((*the_target->read_memory) (where, (unsigned char *) insn,
                                  aarch64_breakpoint_len) != 0)
    return 0;

  return memcmp (insn, aarch64_breakpoint, aarch64_breakpoint_len) == 0;
}

/* Reset STATE: clear the address, control and reference-count mirror
   of every hardware breakpoint and watchpoint slot.  */

static void
aarch64_init_debug_reg_state (struct aarch64_debug_reg_state *state)
{
  int slot;

  /* Hardware breakpoint slots.  */
  for (slot = 0; slot < AARCH64_HBP_MAX_NUM; slot++)
    {
      state->dr_ref_count_bp[slot] = 0;
      state->dr_ctrl_bp[slot] = 0;
      state->dr_addr_bp[slot] = 0;
    }

  /* Hardware watchpoint slots.  */
  for (slot = 0; slot < AARCH64_HWP_MAX_NUM; slot++)
    {
      state->dr_ref_count_wp[slot] = 0;
      state->dr_ctrl_wp[slot] = 0;
      state->dr_addr_wp[slot] = 0;
    }
}

/* Return the pointer to the debug register state structure in the
   current process' arch-specific data area.  */

struct aarch64_debug_reg_state *
aarch64_get_debug_reg_state (pid_t pid)
{
  struct process_info *proc = find_process_pid (pid);

  return &proc->priv->arch_private->debug_reg_state;
}

/* Implementation of linux_target_ops method "supports_z_point_type".

   Hardware breakpoints and all watchpoint kinds are always supported.
   Software breakpoints (Z0) are supported only outside the extended
   protocol and only when the current description is 64-bit.  Any other
   Z_TYPE is unsupported.  */

static int
aarch64_supports_z_point_type (char z_type)
{
  if (z_type == Z_PACKET_SW_BP)
    {
      /* Only enable the Z0 packet in non-multi-arch debugging.  With
         the extended protocol GDBserver may attach to a 32-bit
         process, and it would then have to handle different breakpoint
         instructions (aarch64, arm, thumb etc), so Z0 is disabled.  */
      return !extended_protocol && is_64bit_tdesc ();
    }

  if (z_type == Z_PACKET_HW_BP
      || z_type == Z_PACKET_WRITE_WP
      || z_type == Z_PACKET_READ_WP
      || z_type == Z_PACKET_ACCESS_WP)
    return 1;

  return 0;
}

/* Implementation of linux_target_ops method "insert_point".

   This only records the breakpoint/watchpoint in the per-process debug
   register state; the actual insertion happens when threads are
   resumed.  Watchpoints on a region rejected by
   aarch64_linux_region_ok_for_watchpoint yield -1; otherwise the
   result of the handler is returned.  BP is unused.  */

static int
aarch64_insert_point (enum raw_bkpt_type type, CORE_ADDR addr,
		      int len, struct raw_breakpoint *bp)
{
  struct aarch64_debug_reg_state *dr_state
    = aarch64_get_debug_reg_state (pid_of (current_thread));
  enum target_hw_bp_type hw_type;
  int status;

  if (show_debug_regs)
    fprintf (stderr, "insert_point on entry (addr=0x%08lx, len=%d)\n",
             (unsigned long) addr, len);

  /* Map the raw breakpoint type onto the hardware point type.  */
  hw_type = raw_bkpt_type_to_target_hw_bp_type (type);

  if (hw_type == hw_execute)
    status = aarch64_handle_breakpoint (hw_type, addr, len,
                                        1 /* is_insert */, dr_state);
  else if (aarch64_linux_region_ok_for_watchpoint (addr, len))
    status = aarch64_handle_watchpoint (hw_type, addr, len,
                                        1 /* is_insert */, dr_state);
  else
    status = -1;

  if (show_debug_regs)
    aarch64_show_debug_reg_state (dr_state, "insert_point", addr, len,
                                  hw_type);

  return status;
}

/* Implementation of linux_target_ops method "remove_point".

   This only records the removal in the per-process debug register
   state; the actual removal is done when threads are resumed.  The
   result of the breakpoint or watchpoint handler is returned.  BP is
   unused.  */

static int
aarch64_remove_point (enum raw_bkpt_type type, CORE_ADDR addr,
		      int len, struct raw_breakpoint *bp)
{
  struct aarch64_debug_reg_state *dr_state
    = aarch64_get_debug_reg_state (pid_of (current_thread));
  enum target_hw_bp_type hw_type;
  int status;

  if (show_debug_regs)
    fprintf (stderr, "remove_point on entry (addr=0x%08lx, len=%d)\n",
             (unsigned long) addr, len);

  /* Map the raw breakpoint type onto the hardware point type.  */
  hw_type = raw_bkpt_type_to_target_hw_bp_type (type);

  if (hw_type == hw_execute)
    status = aarch64_handle_breakpoint (hw_type, addr, len,
                                        0 /* is_insert */, dr_state);
  else
    status = aarch64_handle_watchpoint (hw_type, addr, len,
                                        0 /* is_insert */, dr_state);

  if (show_debug_regs)
    aarch64_show_debug_reg_state (dr_state, "remove_point", addr, len,
                                  hw_type);

  return status;
}

/* Implementation of linux_target_ops method "stopped_data_address".  */

static CORE_ADDR
aarch64_stopped_data_address (void)
{
  siginfo_t siginfo;
  int pid, i;
  struct aarch64_debug_reg_state *state;

  pid = lwpid_of (current_thread);

  /* Get the siginfo.  */
  if (ptrace (PTRACE_GETSIGINFO, pid, NULL, &siginfo) != 0)
    return (CORE_ADDR) 0;

  /* Need to be a hardware breakpoint/watchpoint trap.  */
  if (siginfo.si_signo != SIGTRAP
      || (siginfo.si_code & 0xffff) != 0x0004 /* TRAP_HWBKPT */)
    return (CORE_ADDR) 0;

  /* Check if the address matches any watched address.  */
  state = aarch64_get_debug_reg_state (pid_of (current_thread));
  for (i = aarch64_num_wp_regs - 1; i >= 0; --i)
    {
      const unsigned int len = aarch64_watchpoint_length (state->dr_ctrl_wp[i]);
      const CORE_ADDR addr_trap = (CORE_ADDR) siginfo.si_addr;
      const CORE_ADDR addr_watch = state->dr_addr_wp[i];
      if (state->dr_ref_count_wp[i]
	  && DR_CONTROL_ENABLED (state->dr_ctrl_wp[i])
	  && addr_trap >= addr_watch
	  && addr_trap < addr_watch + len)
	return addr_trap;
    }

  return (CORE_ADDR) 0;
}

/* Implementation of linux_target_ops method "stopped_by_watchpoint".
   Returns 1 when aarch64_stopped_data_address reports a non-zero
   address, 0 otherwise.  */

static int
aarch64_stopped_by_watchpoint (void)
{
  return aarch64_stopped_data_address () != 0;
}

/* Fetch the thread-local storage pointer for libthread_db.  The work
   is delegated to aarch64_ps_get_thread_area, which is told whether
   the current target description is 64-bit.  */

ps_err_e
ps_get_thread_area (const struct ps_prochandle *ph,
		    lwpid_t lwpid, int idx, void **base)
{
  const int is_64bit = is_64bit_tdesc ();

  return aarch64_ps_get_thread_area (ph, lwpid, idx, base, is_64bit);
}

/* Implementation of linux_target_ops method "siginfo_fixup".

   For a 64-bit description nothing is done and 0 is returned.
   Otherwise INF is treated as a compat siginfo: with DIRECTION 0 it is
   filled from NATIVE, with any other DIRECTION NATIVE is filled from
   it, and 1 is returned.  */

static int
aarch64_linux_siginfo_fixup (siginfo_t *native, void *inf, int direction)
{
  struct compat_siginfo *compat;

  /* Only a 32-bit inferior needs its siginfo object fixed up.  */
  if (is_64bit_tdesc ())
    return 0;

  compat = (struct compat_siginfo *) inf;

  if (direction == 0)
    aarch64_compat_siginfo_from_siginfo (compat, native);
  else
    aarch64_siginfo_from_compat_siginfo (native, compat);

  return 1;
}

/* Implementation of linux_target_ops method "linux_new_process".
   Allocate the per-process arch data with XCNEW, reset its debug
   register state, and return it.  */

static struct arch_process_info *
aarch64_linux_new_process (void)
{
  struct arch_process_info *arch_info = XCNEW (struct arch_process_info);
  struct aarch64_debug_reg_state *dr_state = &arch_info->debug_reg_state;

  aarch64_init_debug_reg_state (dr_state);

  return arch_info;
}

/* Implementation of linux_target_ops method "linux_new_fork".  */

static void
aarch64_linux_new_fork (struct process_info *parent,
			struct process_info *child)
{
  /* These are allocated by linux_add_process.  */
  gdb_assert (parent->priv != NULL
	      && parent->priv->arch_private != NULL);
  gdb_assert (child->priv != NULL
	      && child->priv->arch_private != NULL);

  /* Linux kernel before 2.6.33 commit
     72f674d203cd230426437cdcf7dd6f681dad8b0d
     will inherit hardware debug registers from parent
     on fork/vfork/clone.  Newer Linux kernels create such tasks with
     zeroed debug registers.

     GDB core assumes the child inherits the watchpoints/hw
     breakpoints of the parent, and will remove them all from the
     forked off process.  Copy the debug registers mirrors into the
     new process so that all breakpoints and watchpoints can be
     removed together.  The debug registers mirror will become zeroed
     in the end before detaching the forked off process, thus making
     this compatible with older Linux kernels too.  */

  *child->priv->arch_private = *parent->priv->arch_private;
}

/* Pick the target description matching the executable of the current
   thread: tdesc_aarch64 when it is a 64-bit ELF file,
   tdesc_arm_with_neon otherwise.  */

static const struct target_desc *
aarch64_linux_read_description (void)
{
  unsigned int machine;
  int tid = lwpid_of (current_thread);

  if (linux_pid_exe_is_elf_64_file (tid, &machine))
    return tdesc_aarch64;

  return tdesc_arm_with_neon;
}

/* Implementation of linux_target_ops method "arch_setup".  Install
   the target description on the current process, then query the debug
   register capacity through the current thread's LWP.  */

static void
aarch64_arch_setup (void)
{
  const struct target_desc *desc = aarch64_linux_read_description ();
  int lwp;

  current_process ()->tdesc = desc;

  lwp = lwpid_of (current_thread);
  aarch64_linux_get_debug_reg_capacity (lwp);
}

/* Register sets for 64-bit inferiors, both transferred with
   PTRACE_GETREGSET/PTRACE_SETREGSET: the general registers
   (NT_PRSTATUS, struct user_pt_regs) and the FP/SIMD registers
   (NT_FPREGSET, struct user_fpsimd_state).  The final entry, with -1
   size and type and NULL handlers, appears to act as the table
   terminator.  */
static struct regset_info aarch64_regsets[] =
{
  { PTRACE_GETREGSET, PTRACE_SETREGSET, NT_PRSTATUS,
    sizeof (struct user_pt_regs), GENERAL_REGS,
    aarch64_fill_gregset, aarch64_store_gregset },
  { PTRACE_GETREGSET, PTRACE_SETREGSET, NT_FPREGSET,
    sizeof (struct user_fpsimd_state), FP_REGS,
    aarch64_fill_fpregset, aarch64_store_fpregset
  },
  { 0, 0, 0, -1, -1, NULL, NULL }
};

/* Wrapper around aarch64_regsets.  num_regsets starts at 0 here;
   NOTE(review): presumably it is filled in during initialization
   outside this view - confirm.  */
static struct regsets_info aarch64_regsets_info =
  {
    aarch64_regsets, /* regsets */
    0, /* num_regsets */
    NULL, /* disabled_regsets */
  };

/* Register access description returned by aarch64_regs_info for
   64-bit inferiors.  Only regsets are provided; there is no regset
   bitmap and no usrregs description.  */
static struct regs_info regs_info_aarch64 =
  {
    NULL, /* regset_bitmap */
    NULL, /* usrregs */
    &aarch64_regsets_info,
  };

/* Implementation of linux_target_ops method "regs_info".  Returns the
   AArch64 description for a 64-bit target description and the AArch32
   one otherwise.  */

static const struct regs_info *
aarch64_regs_info (void)
{
  return is_64bit_tdesc () ? &regs_info_aarch64 : &regs_info_aarch32;
}

/* Implementation of linux_target_ops method "supports_tracepoints".
   With no current thread the answer is 1.  Otherwise tracepoints are
   supported only for a 64-bit description, since aarch32 is not
   supported for now.  */

static int
aarch64_supports_tracepoints (void)
{
  if (current_thread != NULL)
    return is_64bit_tdesc ();

  return 1;
}

/* Implementation of linux_target_ops method "get_thread_area".

   Read the NT_ARM_TLS regset of LWPID as one 64-bit value.  On success
   store it in *ADDRP and return 0.  Return -1 when the ptrace request
   fails, leaving *ADDRP untouched.  */

static int
aarch64_get_thread_area (int lwpid, CORE_ADDR *addrp)
{
  uint64_t tls_value;
  struct iovec iov;

  iov.iov_len = sizeof (tls_value);
  iov.iov_base = &tls_value;

  if (ptrace (PTRACE_GETREGSET, lwpid, NT_ARM_TLS, &iov) == 0)
    {
      *addrp = tls_value;
      return 0;
    }

  return -1;
}

/* Extract a signed value from a bit field within an instruction
   encoding.

   INSN is the instruction opcode.

   WIDTH specifies the width of the bit field to extract (in bits).

   OFFSET specifies the least significant bit of the field where bits
   are numbered zero counting from least to most significant.

   The field is sign-extended from its top bit.  A field that is empty
   or does not fit inside the 32-bit opcode yields 0.  */

static int32_t
extract_signed_bitfield (uint32_t insn, unsigned width, unsigned offset)
{
  const unsigned nbits = sizeof (uint32_t) * 8;
  uint32_t field_mask;
  uint32_t sign_bit;
  uint32_t field;

  /* Reject fields that would need a shift count outside 0..31.  */
  if (width == 0 || width > nbits || offset > nbits - width)
    return 0;

  /* All arithmetic is done on unsigned values: left-shifting a signed
     value into or past the sign bit is undefined behaviour.  */
  field_mask = (width == nbits
		? ~(uint32_t) 0
		: (((uint32_t) 1 << width) - 1));
  sign_bit = (uint32_t) 1 << (width - 1);
  field = (insn >> offset) & field_mask;

  /* Flipping the sign bit and subtracting it again propagates it into
     all the higher bits.  */
  return (int32_t) ((field ^ sign_bit) - sign_bit);
}

/* Decode an opcode if it represents an LDR or LDRSW instruction taking a
   literal offset from the current PC.

   ADDR specifies the address of the opcode (used for debug output only).
   INSN specifies the opcode to test.
   IS_W receives bit 31 of the opcode, which is set for LDRSW.
   IS64 receives the size field from the decoded instruction; it is
   always 1 for LDRSW.
   RT receives the 'rt' field from the decoded instruction.
   OFFSET receives the sign-extended 'imm' field, scaled by 4.

   Return 1 if the opcode matches and is decoded, otherwise 0.  The
   output parameters are left untouched when 0 is returned.  */

int
aarch64_decode_ldr_literal (CORE_ADDR addr, uint32_t insn, int *is_w,
			    int *is64, unsigned *rt, int32_t *offset)
{
  const int bit31 = (insn >> 31) & 0x1;

  /* LDR    0T01 1000 iiii iiii iiii iiii iiir rrrr */
  /* LDRSW  1001 1000 iiii iiii iiii iiii iiir rrrr */
  if ((insn & 0x3f000000) != 0x18000000)
    return 0;

  *is_w = bit31;

  /* LDRSW always takes a 64-bit destination register; for plain LDR
     the size comes from bit 30.  */
  if (bit31)
    *is64 = 1;
  else
    *is64 = (insn >> 30) & 0x1;

  *rt = insn & 0x1f;
  *offset = extract_signed_bitfield (insn, 19, 5) << 2;

  if (aarch64_debug)
    debug_printf ("decode: %s 0x%x %s %s%u, #?\n",
		  core_addr_to_string_nz (addr), insn,
		  *is_w ? "ldrsw" : "ldr",
		  *is64 ? "x" : "w", *rt);

  return 1;
}

/* List of opcodes that we need for building the jump pad and relocating
   an instruction.

   Each value is the fixed part of an encoding; the emit_* helpers OR
   the operand fields (registers, immediates, size bits) into it.  The
   comment above each group shows the bit layout, most significant bit
   first.  Several entries are derived from a neighbour by setting
   extra bits.  */

enum aarch64_opcodes
{
  /* B              0001 01ii iiii iiii iiii iiii iiii iiii */
  /* BL             1001 01ii iiii iiii iiii iiii iiii iiii */
  /* B.COND         0101 0100 iiii iiii iiii iiii iii0 cccc */
  /* CBZ            s011 0100 iiii iiii iiii iiii iiir rrrr */
  /* CBNZ           s011 0101 iiii iiii iiii iiii iiir rrrr */
  /* TBZ            b011 0110 bbbb biii iiii iiii iiir rrrr */
  /* TBNZ           b011 0111 bbbb biii iiii iiii iiir rrrr */
  B               = 0x14000000,
  BL              = 0x80000000 | B,
  BCOND           = 0x40000000 | B,
  CBZ             = 0x20000000 | B,
  CBNZ            = 0x21000000 | B,
  TBZ             = 0x36000000 | B,
  TBNZ            = 0x37000000 | B,
  /* BLR            1101 0110 0011 1111 0000 00rr rrr0 0000 */
  BLR             = 0xd63f0000,
  /* RET            1101 0110 0101 1111 0000 00rr rrr0 0000 */
  RET             = 0xd65f0000,
  /* STP            s010 100o o0ii iiii irrr rrrr rrrr rrrr */
  /* LDP            s010 100o o1ii iiii irrr rrrr rrrr rrrr */
  /* STP (SIMD&VFP) ss10 110o o0ii iiii irrr rrrr rrrr rrrr */
  /* LDP (SIMD&VFP) ss10 110o o1ii iiii irrr rrrr rrrr rrrr */
  STP             = 0x28000000,
  LDP             = 0x28400000,
  STP_SIMD_VFP    = 0x04000000 | STP,
  LDP_SIMD_VFP    = 0x04000000 | LDP,
  /* STR            ss11 100o 00xi iiii iiii xxrr rrrr rrrr */
  /* LDR            ss11 100o 01xi iiii iiii xxrr rrrr rrrr */
  /* LDRSW          1011 100o 10xi iiii iiii xxrr rrrr rrrr */
  STR             = 0x38000000,
  LDR             = 0x00400000 | STR,
  LDRSW           = 0x80800000 | STR,
  /* LDAXR          ss00 1000 0101 1111 1111 11rr rrrr rrrr */
  LDAXR           = 0x085ffc00,
  /* STXR           ss00 1000 000r rrrr 0111 11rr rrrr rrrr */
  STXR            = 0x08007c00,
  /* STLR           ss00 1000 1001 1111 1111 11rr rrrr rrrr */
  STLR            = 0x089ffc00,
  /* MOV            s101 0010 1xxi iiii iiii iiii iiir rrrr */
  /* MOVK           s111 0010 1xxi iiii iiii iiii iiir rrrr */
  MOV             = 0x52800000,
  MOVK            = 0x20000000 | MOV,
  /* ADD            s00o ooo1 xxxx xxxx xxxx xxxx xxxx xxxx */
  /* SUB            s10o ooo1 xxxx xxxx xxxx xxxx xxxx xxxx */
  /* SUBS           s11o ooo1 xxxx xxxx xxxx xxxx xxxx xxxx */
  ADD             = 0x01000000,
  SUB             = 0x40000000 | ADD,
  SUBS            = 0x20000000 | SUB,
  /* AND            s000 1010 xx0x xxxx xxxx xxxx xxxx xxxx */
  /* ORR            s010 1010 xx0x xxxx xxxx xxxx xxxx xxxx */
  /* ORN            s010 1010 xx1x xxxx xxxx xxxx xxxx xxxx */
  /* EOR            s100 1010 xx0x xxxx xxxx xxxx xxxx xxxx */
  AND             = 0x0a000000,
  ORR             = 0x20000000 | AND,
  ORN             = 0x00200000 | ORR,
  EOR             = 0x40000000 | AND,
  /* LSLV           s001 1010 110r rrrr 0010 00rr rrrr rrrr */
  /* LSRV           s001 1010 110r rrrr 0010 01rr rrrr rrrr */
  /* ASRV           s001 1010 110r rrrr 0010 10rr rrrr rrrr */
  LSLV             = 0x1ac02000,
  LSRV             = 0x00000400 | LSLV,
  ASRV             = 0x00000800 | LSLV,
  /* SBFM           s001 0011 0nii iiii iiii iirr rrrr rrrr */
  SBFM            = 0x13000000,
  /* UBFM           s101 0011 0nii iiii iiii iirr rrrr rrrr */
  UBFM            = 0x40000000 | SBFM,
  /* CSINC          s001 1010 100r rrrr cccc 01rr rrrr rrrr */
  /* NOTE(review): unlike its neighbours, this value already has bit 31
     (the 's' position in the layout above) set.  */
  CSINC           = 0x9a800400,
  /* MUL            s001 1011 000r rrrr 0111 11rr rrrr rrrr */
  MUL             = 0x1b007c00,
  /* MSR (register) 1101 0101 0001 oooo oooo oooo ooor rrrr */
  /* MRS            1101 0101 0011 oooo oooo oooo ooor rrrr */
  MSR             = 0xd5100000,
  MRS             = 0x00200000 | MSR,
  /* HINT           1101 0101 0000 0011 0010 oooo ooo1 1111 */
  /* SEVL, WFE and NOP are HINT with different values in the 'o'
     field, which starts at bit 5.  */
  HINT            = 0xd503201f,
  SEVL            = (5 << 5) | HINT,
  WFE             = (2 << 5) | HINT,
  NOP             = (0 << 5) | HINT,
};

/* List of condition codes that we need.  These are the values placed
   in the 4-bit condition field of a conditional branch (see the COND
   argument of emit_bcond).  */

enum aarch64_condition_codes
{
  EQ = 0x0,
  NE = 0x1,
  LO = 0x3,
  GE = 0xa,
  LT = 0xb,
  GT = 0xc,
  LE = 0xd,
};

/* Representation of a general purpose register of the form xN or wN.

   This type is used by emitting functions that take registers as operands.  */

struct aarch64_register
{
  /* Register number N; the emitting functions encode it in a 5-bit
     field.  */
  unsigned num;
  /* Non-zero for the 64-bit (xN) form, zero for the 32-bit (wN)
     form.  */
  int is64;
};

/* Representation of an operand.  At this time, it only supports register
   and immediate types.  Build values with register_operand and
   immediate_operand.  */

struct aarch64_operand
{
  /* Type of the operand.  It selects which member of the union below
     is meaningful.  */
  enum
    {
      OPERAND_IMMEDIATE,
      OPERAND_REGISTER,
    } type;
  /* Value of the operand according to the type: IMM for
     OPERAND_IMMEDIATE, REG for OPERAND_REGISTER.  */
  union
    {
      uint32_t imm;
      struct aarch64_register reg;
    };
};

/* List of registers that we are currently using, we can add more here as
   we need to use them.  Each initializer is { register number, is64 }.  */

/* General purpose scratch registers (64 bit).  */
static const struct aarch64_register x0 = { 0, 1 };
static const struct aarch64_register x1 = { 1, 1 };
static const struct aarch64_register x2 = { 2, 1 };
static const struct aarch64_register x3 = { 3, 1 };
static const struct aarch64_register x4 = { 4, 1 };

/* General purpose scratch registers (32 bit).  */
static const struct aarch64_register w0 = { 0, 0 };
static const struct aarch64_register w2 = { 2, 0 };

/* Intra-procedure scratch registers.  */
static const struct aarch64_register ip0 = { 16, 1 };

/* Special purpose registers.  Note that sp and xzr carry the same
   register number (31); NOTE(review): presumably the instruction being
   emitted decides which of the two that number denotes.  */
static const struct aarch64_register fp = { 29, 1 };
static const struct aarch64_register lr = { 30, 1 };
static const struct aarch64_register sp = { 31, 1 };
static const struct aarch64_register xzr = { 31, 1 };

/* Dynamically allocate a new register.  If we know the register
   statically, we should make it a global as above instead of using this
   helper function.  */

static struct aarch64_register
aarch64_register (unsigned num, int is64)
{
  return (struct aarch64_register) { num, is64 };
}

/* Helper function to create a register operand, for instructions with
   different types of operands.

   For example:
   p += emit_mov (p, x0, register_operand (x1));  */

static struct aarch64_operand
register_operand (struct aarch64_register reg)
{
  struct aarch64_operand operand;

  operand.type = OPERAND_REGISTER;
  operand.reg = reg;

  return operand;
}

/* Helper function to create an immediate operand, for instructions with
   different types of operands.

   For example:
   p += emit_mov (p, x0, immediate_operand (12));  */

static struct aarch64_operand
immediate_operand (uint32_t imm)
{
  struct aarch64_operand operand;

  operand.type = OPERAND_IMMEDIATE;
  operand.imm = imm;

  return operand;
}

/* Representation of a memory operand, used for load and store
   instructions.  Build values with offset_memory_operand,
   preindex_memory_operand and postindex_memory_operand.

   The types correspond to the following variants:

   MEMORY_OPERAND_OFFSET:    LDR rt, [rn, #offset]
   MEMORY_OPERAND_PREINDEX:  LDR rt, [rn, #index]!
   MEMORY_OPERAND_POSTINDEX: LDR rt, [rn], #index  */

struct aarch64_memory_operand
{
  /* Type of the operand.  */
  enum
    {
      MEMORY_OPERAND_OFFSET,
      MEMORY_OPERAND_PREINDEX,
      MEMORY_OPERAND_POSTINDEX,
    } type;
  /* Index from the base register, in bytes; the emitting functions
     scale it down before encoding it.  */
  int32_t index;
};

/* Helper function to create an offset memory operand.

   For example:
   p += emit_ldr (p, x0, sp, offset_memory_operand (16));  */

static struct aarch64_memory_operand
offset_memory_operand (int32_t offset)
{
  return (struct aarch64_memory_operand) { MEMORY_OPERAND_OFFSET, offset };
}

/* Helper function to create a pre-index memory operand.

   For example:
   p += emit_ldr (p, x0, sp, preindex_memory_operand (16));  */

static struct aarch64_memory_operand
preindex_memory_operand (int32_t index)
{
  return (struct aarch64_memory_operand) { MEMORY_OPERAND_PREINDEX, index };
}

/* Helper function to create a post-index memory operand.

   For example:
   p += emit_ldr (p, x0, sp, postindex_memory_operand (16));  */

static struct aarch64_memory_operand
postindex_memory_operand (int32_t index)
{
  return (struct aarch64_memory_operand) { MEMORY_OPERAND_POSTINDEX, index };
}

/* System control registers.  These special registers can be written and
   read with the MRS and MSR instructions.

   - NZCV: Condition flags.  GDB refers to this register under the CPSR
	   name.
   - FPSR: Floating-point status register.
   - FPCR: Floating-point control registers.
   - TPIDR_EL0: Software thread ID register.

   Each value packs the op0, op1, crn, crm and op2 fields, in that
   order from most to least significant, into a 15-bit number.  */

enum aarch64_system_control_registers
{
  /*          op0           op1           crn          crm          op2  */
  NZCV =      (0x1 << 14) | (0x3 << 11) | (0x4 << 7) | (0x2 << 3) | 0x0,
  FPSR =      (0x1 << 14) | (0x3 << 11) | (0x4 << 7) | (0x4 << 3) | 0x1,
  FPCR =      (0x1 << 14) | (0x3 << 11) | (0x4 << 7) | (0x4 << 3) | 0x0,
  TPIDR_EL0 = (0x1 << 14) | (0x3 << 11) | (0xd << 7) | (0x0 << 3) | 0x2
};

/* Helper macro to mask and shift a value into a bitfield.

   VAL is truncated to its SIZE least significant bits and the result
   is placed at bit OFFSET of a uint32_t.  SIZE must be smaller than
   64.  Every argument is parenthesized so that an expression using a
   low-precedence operator (e.g. "a | b") is masked as a whole.  */

#define ENCODE(val, size, offset) \
  ((uint32_t) (((val) & ((1ULL << (size)) - 1)) << (offset)))

/* Store the 32-bit instruction word INSN at *BUF.  Returns the number
   of instructions written, which is always 1.  */

static int
emit_insn (uint32_t *buf, uint32_t insn)
{
  buf[0] = insn;

  return 1;
}

/* Write a B or BL instruction into *BUF.

     B  #offset
     BL #offset

   IS_BL selects BL (non-zero) over B (zero).
   OFFSET is the immediate offset from the current PC.  It is
   byte-addressed but should be 4 bytes aligned.  It has a limited range of
   +/- 128MB (26 bits << 2).

   Returns the number of instructions written.  */

static int
emit_b (uint32_t *buf, int is_bl, int32_t offset)
{
  const uint32_t opcode = is_bl ? BL : B;
  const uint32_t imm26 = ENCODE (offset >> 2, 26, 0);

  return emit_insn (buf, opcode | imm26);
}

/* Write a BCOND instruction into *BUF.

     B.COND #offset

   COND specifies the condition field (4 bits).
   OFFSET is the immediate offset from the current PC.  It is
   byte-addressed but should be 4 bytes aligned.  It has a limited range of
   +/- 1MB (19 bits << 2).

   Returns the number of instructions written.  */

static int
emit_bcond (uint32_t *buf, unsigned cond, int32_t offset)
{
  const uint32_t imm19 = ENCODE (offset >> 2, 19, 5);
  const uint32_t cond_bits = ENCODE (cond, 4, 0);

  return emit_insn (buf, BCOND | imm19 | cond_bits);
}

/* Write a CBZ or CBNZ instruction into *BUF.

     CBZ  rt, #offset
     CBNZ rt, #offset

   IS_CBNZ selects CBNZ (non-zero) over CBZ (zero).
   RT is the register to test; its is64 flag sets the size bit.
   OFFSET is the immediate offset from the current PC.  It is
   byte-addressed but should be 4 bytes aligned.  It has a limited range of
   +/- 1MB (19 bits << 2).

   Returns the number of instructions written.  */

static int
emit_cb (uint32_t *buf, int is_cbnz, struct aarch64_register rt,
	 int32_t offset)
{
  uint32_t insn = is_cbnz ? CBNZ : CBZ;

  insn |= ENCODE (rt.is64, 1, 31);
  insn |= ENCODE (offset >> 2, 19, 5);
  insn |= ENCODE (rt.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a TBZ or TBNZ instruction into *BUF.

     TBZ  rt, #bit, #offset
     TBNZ rt, #bit, #offset

   IS_TBNZ selects TBNZ (non-zero) over TBZ (zero).
   RT is the register to test.
   BIT is the index of the bit to test in register RT.  Its low five
   bits and its sixth bit go into separate fields of the encoding.
   OFFSET is the immediate offset from the current PC.  It is
   byte-addressed but should be 4 bytes aligned.  It has a limited range of
   +/- 32KB (14 bits << 2).

   Returns the number of instructions written.  */

static int
emit_tb (uint32_t *buf, int is_tbnz, unsigned bit,
	 struct aarch64_register rt, int32_t offset)
{
  uint32_t insn = is_tbnz ? TBNZ : TBZ;

  insn |= ENCODE (bit >> 5, 1, 31);
  insn |= ENCODE (bit, 5, 19);
  insn |= ENCODE (offset >> 2, 14, 5);
  insn |= ENCODE (rt.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a BLR instruction into *BUF.

     BLR rn

   RN is the register holding the branch target.

   Returns the number of instructions written.  */

static int
emit_blr (uint32_t *buf, struct aarch64_register rn)
{
  const uint32_t rn_bits = ENCODE (rn.num, 5, 5);

  return emit_insn (buf, BLR | rn_bits);
}

/* Write a RET instruction into *BUF.

     RET xn

   RN is the register holding the return address.

   Returns the number of instructions written.  */

static int
emit_ret (uint32_t *buf, struct aarch64_register rn)
{
  const uint32_t rn_bits = ENCODE (rn.num, 5, 5);

  return emit_insn (buf, RET | rn_bits);
}

/* Helper function emitting a load or store pair instruction.

   OPCODE is the base encoding (STP or LDP).
   RT and RT2 are the registers to transfer; the is64 flag of RT picks
   the register size.
   RN is the base address register.
   OPERAND gives the addressing mode and the byte index, which is
   divided by 8 before being encoded in 7 bits.

   Returns the number of instructions written, or 0 without writing
   anything when the operand type is not recognized.  */

static int
emit_load_store_pair (uint32_t *buf, enum aarch64_opcodes opcode,
		      struct aarch64_register rt,
		      struct aarch64_register rt2,
		      struct aarch64_register rn,
		      struct aarch64_memory_operand operand)
{
  unsigned opc_field = 0;
  unsigned pre_index_bit;
  unsigned write_back_bit;
  uint32_t insn;

  if (rt.is64)
    opc_field = 2;

  /* Bit 24 and bit 23 together select the addressing mode.  */
  if (operand.type == MEMORY_OPERAND_OFFSET)
    {
      pre_index_bit = 1;
      write_back_bit = 0;
    }
  else if (operand.type == MEMORY_OPERAND_POSTINDEX)
    {
      pre_index_bit = 0;
      write_back_bit = 1;
    }
  else if (operand.type == MEMORY_OPERAND_PREINDEX)
    {
      pre_index_bit = 1;
      write_back_bit = 1;
    }
  else
    return 0;

  insn = opcode;
  insn |= ENCODE (opc_field, 2, 30);
  insn |= ENCODE (pre_index_bit, 1, 24);
  insn |= ENCODE (write_back_bit, 1, 23);
  insn |= ENCODE (operand.index >> 3, 7, 15);
  insn |= ENCODE (rt2.num, 5, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rt.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a STP instruction into *BUF.

     STP rt, rt2, [rn, #offset]
     STP rt, rt2, [rn, #index]!
     STP rt, rt2, [rn], #index

   RT and RT2 are the registers to store.
   RN is the base address register.
   OFFSET is the immediate to add to the base address.  It is limited to a
   -512 .. 504 range (7 bits << 3).  */

static int
emit_stp (uint32_t *buf, struct aarch64_register rt,
	  struct aarch64_register rt2, struct aarch64_register rn,
	  struct aarch64_memory_operand operand)
{
  return emit_load_store_pair (buf, STP, rt, rt2, rn, operand);
}

/* Write a LDP instruction into *BUF.

     LDP rt, rt2, [rn, #offset]
     LDP rt, rt2, [rn, #index]!
     LDP rt, rt2, [rn], #index

   RT and RT2 are the two registers to load.
   RN is the base address register.
   OPERAND selects the addressing mode and carries the immediate.  With
   64-bit registers the immediate is limited to the -512 .. 504 range
   (7 bits << 3).  */

static int
emit_ldp (uint32_t *buf, struct aarch64_register rt,
	  struct aarch64_register rt2, struct aarch64_register rn,
	  struct aarch64_memory_operand operand)
{
  int emitted = emit_load_store_pair (buf, LDP, rt, rt2, rn, operand);

  return emitted;
}

/* Write a LDP (SIMD&VFP) instruction using Q registers into *BUF.

     LDP qt, qt2, [rn, #offset]

   RT and RT2 are the numbers of the Q registers to load.
   RN is the base address register.
   OFFSET is the byte immediate added to the base address.  It is
   encoded in 7 bits after being divided by 16, which limits it to the
   -1024 .. 1008 range.  */

static int
emit_ldp_q_offset (uint32_t *buf, unsigned rt, unsigned rt2,
		   struct aarch64_register rn, int32_t offset)
{
  /* Start from the fixed bits: opc field set to 2 and bit 24 set.  */
  uint32_t insn = LDP_SIMD_VFP | ENCODE (2, 2, 30) | ENCODE (1, 1, 24);

  insn |= ENCODE (offset >> 4, 7, 15);
  insn |= ENCODE (rt2, 5, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rt, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a STP (SIMD&VFP) instruction using Q registers into *BUF.

     STP qt, qt2, [rn, #offset]

   RT and RT2 are the numbers of the Q registers to store.
   RN is the base address register.
   OFFSET is the byte immediate added to the base address.  It is
   encoded in 7 bits after being divided by 16, which limits it to the
   -1024 .. 1008 range.  */

static int
emit_stp_q_offset (uint32_t *buf, unsigned rt, unsigned rt2,
		   struct aarch64_register rn, int32_t offset)
{
  /* Start from the fixed bits: opc field set to 2 and bit 24 set.  */
  uint32_t insn = STP_SIMD_VFP | ENCODE (2, 2, 30) | ENCODE (1, 1, 24);

  insn |= ENCODE (offset >> 4, 7, 15);
  insn |= ENCODE (rt2, 5, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rt, 5, 0);

  return emit_insn (buf, insn);
}

/* Helper function emitting a load or store instruction.

   SIZE is the value placed in the 2-bit size field (bits 30-31).
   OPCODE is the base opcode of the instruction.
   RT is the register to transfer and RN the base address register.
   OPERAND selects the addressing mode and carries the byte immediate:

   - MEMORY_OPERAND_OFFSET: the immediate goes in a 12-bit field
     expressed in units of the access size, so it is divided by
     2^SIZE (8 for SIZE 3, 4 for SIZE 2, 2 for SIZE 1, 1 for SIZE 0).
   - MEMORY_OPERAND_POSTINDEX and MEMORY_OPERAND_PREINDEX: the immediate
     goes unscaled in a 9-bit field.

   Return the value returned by emit_insn, or 0 without writing anything
   if the operand type is not supported.  */

static int
emit_load_store (uint32_t *buf, uint32_t size, enum aarch64_opcodes opcode,
		 struct aarch64_register rt, struct aarch64_register rn,
		 struct aarch64_memory_operand operand)
{
  uint32_t op;

  switch (operand.type)
    {
    case MEMORY_OPERAND_OFFSET:
      {
	op = ENCODE (1, 1, 24);

	/* The offset used to be divided by 8 whatever the access size,
	   which is only right for 64-bit accesses.  */
	return emit_insn (buf, opcode | ENCODE (size, 2, 30) | op
			  | ENCODE (operand.index >> size, 12, 10)
			  | ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
      }
    case MEMORY_OPERAND_POSTINDEX:
      {
	uint32_t post_index = ENCODE (1, 2, 10);

	op = ENCODE (0, 1, 24);

	return emit_insn (buf, opcode | ENCODE (size, 2, 30) | op
			  | post_index | ENCODE (operand.index, 9, 12)
			  | ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
      }
    case MEMORY_OPERAND_PREINDEX:
      {
	uint32_t pre_index = ENCODE (3, 2, 10);

	op = ENCODE (0, 1, 24);

	return emit_insn (buf, opcode | ENCODE (size, 2, 30) | op
			  | pre_index | ENCODE (operand.index, 9, 12)
			  | ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
      }
    default:
      return 0;
    }
}

/* Write a LDR instruction into *BUF.

     LDR rt, [rn, #offset]
     LDR rt, [rn, #index]!
     LDR rt, [rn], #index

   RT is the register to load; its width selects the size field (3 for
   64-bit, 2 for 32-bit).
   RN is the base address register.
   OPERAND selects the addressing mode and carries the immediate.  With
   a 64-bit RT the offset form is limited to the 0 .. 32760 range
   (12 bits << 3).  */

static int
emit_ldr (uint32_t *buf, struct aarch64_register rt,
	  struct aarch64_register rn, struct aarch64_memory_operand operand)
{
  uint32_t size;

  if (rt.is64)
    size = 3;
  else
    size = 2;

  return emit_load_store (buf, size, LDR, rt, rn, operand);
}

/* Write a LDRH instruction into *BUF.

     LDRH wt, [xn, #offset]
     LDRH wt, [xn, #index]!
     LDRH wt, [xn], #index

   RT is the register to load.
   RN is the base address register.
   OPERAND selects the addressing mode and carries the immediate; it is
   encoded by emit_load_store.

   The instruction is built from the LDR opcode with the size field set
   to 1.  */

static int
emit_ldrh (uint32_t *buf, struct aarch64_register rt,
	   struct aarch64_register rn,
	   struct aarch64_memory_operand operand)
{
  const uint32_t halfword_size = 1;

  return emit_load_store (buf, halfword_size, LDR, rt, rn, operand);
}

/* Write a LDRB instruction into *BUF.

     LDRB wt, [xn, #offset]
     LDRB wt, [xn, #index]!
     LDRB wt, [xn], #index

   RT is the register to load.
   RN is the base address register.
   OPERAND selects the addressing mode and carries the immediate; it is
   encoded by emit_load_store.

   The instruction is built from the LDR opcode with the size field set
   to 0.  */

static int
emit_ldrb (uint32_t *buf, struct aarch64_register rt,
	   struct aarch64_register rn,
	   struct aarch64_memory_operand operand)
{
  const uint32_t byte_size = 0;

  return emit_load_store (buf, byte_size, LDR, rt, rn, operand);
}

/* Write a LDRSW instruction into *BUF.  The register size is 64-bit.

     LDRSW xt, [rn, #offset]
     LDRSW xt, [rn, #index]!
     LDRSW xt, [rn], #index

   RT is the register to load.
   RN is the base address register.
   OPERAND selects the addressing mode and carries the immediate; it is
   encoded by emit_load_store.

   NOTE(review): a size field of 3 is combined with the LDRSW opcode
   here although LDRSW reads a 32-bit word from memory; confirm against
   the LDRSW opcode definition that this yields the intended encoding
   and offset scaling.  */

static int
emit_ldrsw (uint32_t *buf, struct aarch64_register rt,
		   struct aarch64_register rn,
		   struct aarch64_memory_operand operand)
{
  const uint32_t size = 3;

  return emit_load_store (buf, size, LDRSW, rt, rn, operand);
}

/* Write a STR instruction into *BUF.

     STR rt, [rn, #offset]
     STR rt, [rn, #index]!
     STR rt, [rn], #index

   RT is the register to store; its width selects the size field (3 for
   64-bit, 2 for 32-bit).
   RN is the base address register.
   OPERAND selects the addressing mode and carries the immediate.  With
   a 64-bit RT the offset form is limited to the 0 .. 32760 range
   (12 bits << 3).  */

static int
emit_str (uint32_t *buf, struct aarch64_register rt,
	  struct aarch64_register rn,
	  struct aarch64_memory_operand operand)
{
  uint32_t size;

  if (rt.is64)
    size = 3;
  else
    size = 2;

  return emit_load_store (buf, size, STR, rt, rn, operand);
}

/* Helper function emitting an exclusive load or store instruction.

   SIZE goes in the 2-bit size field (bits 30-31) and OPCODE provides
   the base opcode.  RS, RT2, RN and RT fill the register fields at bits
   16, 10, 5 and 0 respectively; callers pass xzr for the fields their
   instruction does not use.  */

static int
emit_load_store_exclusive (uint32_t *buf, uint32_t size,
			   enum aarch64_opcodes opcode,
			   struct aarch64_register rs,
			   struct aarch64_register rt,
			   struct aarch64_register rt2,
			   struct aarch64_register rn)
{
  uint32_t insn = opcode | ENCODE (size, 2, 30);

  insn |= ENCODE (rs.num, 5, 16);
  insn |= ENCODE (rt2.num, 5, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rt.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a LDAXR instruction into *BUF.

     LDAXR rt, [xn]

   RT is the destination register; its width selects the size field.
   RN is the base address register.  */

static int
emit_ldaxr (uint32_t *buf, struct aarch64_register rt,
	    struct aarch64_register rn)
{
  uint32_t size;

  if (rt.is64)
    size = 3;
  else
    size = 2;

  /* The RS and RT2 fields are not used by LDAXR; fill them with xzr.  */
  return emit_load_store_exclusive (buf, size, LDAXR, xzr, rt, xzr, rn);
}

/* Write a STXR instruction into *BUF.

     STXR ws, rt, [xn]

   RS is the result register, it indicates if the store succeeded or not.
   RT is the register to store; its width selects the size field.
   RN is the base address register.  */

static int
emit_stxr (uint32_t *buf, struct aarch64_register rs,
	   struct aarch64_register rt, struct aarch64_register rn)
{
  uint32_t size;

  if (rt.is64)
    size = 3;
  else
    size = 2;

  /* The RT2 field is not used by STXR; fill it with xzr.  */
  return emit_load_store_exclusive (buf, size, STXR, rs, rt, xzr, rn);
}

/* Write a STLR instruction into *BUF.

     STLR rt, [xn]

   RT is the register to store; its width selects the size field.
   RN is the base address register.  */

static int
emit_stlr (uint32_t *buf, struct aarch64_register rt,
	   struct aarch64_register rn)
{
  uint32_t size;

  if (rt.is64)
    size = 3;
  else
    size = 2;

  /* The RS and RT2 fields are not used by STLR; fill them with xzr.  */
  return emit_load_store_exclusive (buf, size, STLR, xzr, rt, xzr, rn);
}

/* Helper function for data processing instructions with register sources.

   OPCODE is the base opcode.  RD is the destination register; RD.is64
   sets bit 31 to select the 64-bit form.  RN and RM are the source
   registers.  */

static int
emit_data_processing_reg (uint32_t *buf, enum aarch64_opcodes opcode,
			  struct aarch64_register rd,
			  struct aarch64_register rn,
			  struct aarch64_register rm)
{
  uint32_t insn = opcode | ENCODE (rd.is64, 1, 31);

  insn |= ENCODE (rm.num, 5, 16);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Helper function for data processing instructions taking either a register
   or an immediate.

   OPCODE is the base opcode; the bits that distinguish the register
   form from the immediate form are added here.  RD is the destination
   register, RN the first source and OPERAND the second source.  An
   immediate is encoded in a 12-bit field.  */

static int
emit_data_processing (uint32_t *buf, enum aarch64_opcodes opcode,
		      struct aarch64_register rd,
		      struct aarch64_register rn,
		      struct aarch64_operand operand)
{
  /* The opcode is different for register and immediate source operands.  */
  uint32_t form;
  uint32_t insn;

  if (operand.type != OPERAND_IMMEDIATE)
    {
      /* xxx0 101x xxxx xxxx xxxx xxxx xxxx xxxx */
      form = ENCODE (5, 4, 25);

      return emit_data_processing_reg (buf, opcode | form, rd, rn,
				       operand.reg);
    }

  /* xxx1 000x xxxx xxxx xxxx xxxx xxxx xxxx */
  form = ENCODE (8, 4, 25);

  insn = opcode | form | ENCODE (rd.is64, 1, 31);
  insn |= ENCODE (operand.imm, 12, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write an ADD instruction into *BUF.

     ADD rd, rn, #imm
     ADD rd, rn, rm

   Both the immediate and the register form are handled.

   RD is the destination register.
   RN is the first source register.
   OPERAND is the second source, either of type OPERAND_IMMEDIATE or
   OPERAND_REGISTER.  */

static int
emit_add (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_operand operand)
{
  int emitted = emit_data_processing (buf, ADD, rd, rn, operand);

  return emitted;
}

/* Write a SUB instruction into *BUF.

     SUB rd, rn, #imm
     SUB rd, rn, rm

   Both the immediate and the register form are handled.

   RD is the destination register.
   RN is the first source register.
   OPERAND is the value to subtract from RN, either of type
   OPERAND_IMMEDIATE or OPERAND_REGISTER.  */

static int
emit_sub (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_operand operand)
{
  int emitted = emit_data_processing (buf, SUB, rd, rn, operand);

  return emitted;
}

/* Write a MOV instruction into *BUF.

     MOV rd, #imm
     MOV rd, rm

   This function handles both a wide immediate move and a register move,
   with the condition that the source register is not xzr.  xzr and the
   stack pointer share the same encoding and this function only supports
   the stack pointer.

   The register move is emitted as ADD rd, rm, #0.  The immediate move
   encodes a 16-bit immediate with no shift.

   RD is the destination register.
   OPERAND is the source operand, either of type OPERAND_IMMEDIATE or
   OPERAND_REGISTER.  */

static int
emit_mov (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_operand operand)
{
  uint32_t insn;

  if (operand.type != OPERAND_IMMEDIATE)
    return emit_add (buf, rd, operand.reg, immediate_operand (0));

  /* Bit 31 selects the register width; the shift field at bit 21 is
     left at 0 so the immediate is not shifted.  */
  insn = MOV | ENCODE (rd.is64, 1, 31) | ENCODE (0, 2, 21);
  insn |= ENCODE (operand.imm, 16, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a MOVK instruction into *BUF.

     MOVK rd, #imm, lsl #shift

   RD is the destination register.
   IMM is the 16-bit immediate.
   SHIFT is the value of the 2-bit shift field; emit_mov_addr passes 1,
   2 and 3 for the lsl #16, #32 and #48 forms.  */

static int
emit_movk (uint32_t *buf, struct aarch64_register rd, uint32_t imm, unsigned shift)
{
  uint32_t insn = MOVK | ENCODE (rd.is64, 1, 31);

  insn |= ENCODE (shift, 2, 21);
  insn |= ENCODE (imm, 16, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write instructions into *BUF in order to move ADDR into a register.
   ADDR can be a 64-bit value.

   A MOV is always emitted for the low 16 bits, followed by as many MOVK
   instructions as needed to reach the highest non-zero 16-bit chunk:

     MOV  xd, #(addr)
     MOVK xd, #(addr >> 16), lsl #16
     MOVK xd, #(addr >> 32), lsl #32
     MOVK xd, #(addr >> 48), lsl #48

   Return the number of instructions written.  */

static int
emit_mov_addr (uint32_t *buf, struct aarch64_register rd, CORE_ADDR addr)
{
  uint32_t *cursor = buf;
  unsigned chunk;

  /* The MOV (wide immediate) instruction clears the top bits of the
     register.  */
  cursor += emit_mov (cursor, rd, immediate_operand (addr & 0xffff));

  for (chunk = 1; chunk <= 3; chunk++)
    {
      CORE_ADDR upper = addr >> (16 * chunk);

      /* Nothing but zeros from this chunk upwards: we are done.  */
      if (upper == 0)
	break;

      cursor += emit_movk (cursor, rd, upper & 0xffff, chunk);
    }

  return cursor - buf;
}

/* Write a SUBS instruction into *BUF.

     SUBS rd, rn, #imm
     SUBS rd, rn, rm

   This instruction updates the condition flags.

   RD is the destination register.
   RN is the first source register.
   OPERAND is the value to subtract from RN, either an immediate or a
   register.  */

static int
emit_subs (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, struct aarch64_operand operand)
{
  int emitted = emit_data_processing (buf, SUBS, rd, rn, operand);

  return emitted;
}

/* Write a CMP instruction into *BUF.

     CMP rn, rm

   This instruction is an alias of SUBS xzr, rn, rm.

   RN and RM are the registers to compare.  */

static int
emit_cmp (uint32_t *buf, struct aarch64_register rn,
	      struct aarch64_operand operand)
{
  return emit_subs (buf, xzr, rn, operand);
}

/* Write an AND instruction into *BUF.

     AND rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_and (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, AND, rd, rn, rm);

  return emitted;
}

/* Write an ORR instruction into *BUF.

     ORR rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_orr (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, ORR, rd, rn, rm);

  return emitted;
}

/* Write an ORN instruction into *BUF.

     ORN rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_orn (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, ORN, rd, rn, rm);

  return emitted;
}

/* Write an EOR instruction into *BUF.

     EOR rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_eor (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, EOR, rd, rn, rm);

  return emitted;
}

/* Write a MVN instruction into *BUF.

     MVN rd, rm

   This is an alias for ORN rd, xzr, rm.

   RD is the destination register.
   RM is the source register.  */

static int
emit_mvn (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rm)
{
  return emit_orn (buf, rd, xzr, rm);
}

/* Write a LSLV instruction into *BUF.

     LSLV rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_lslv (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, LSLV, rd, rn, rm);

  return emitted;
}

/* Write a LSRV instruction into *BUF.

     LSRV rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_lsrv (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, LSRV, rd, rn, rm);

  return emitted;
}

/* Write an ASRV instruction into *BUF.

     ASRV rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_asrv (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, ASRV, rd, rn, rm);

  return emitted;
}

/* Write a MUL instruction into *BUF.

     MUL rd, rn, rm

   RD is the destination register.
   RN and RM are the source registers.  */

static int
emit_mul (uint32_t *buf, struct aarch64_register rd,
	  struct aarch64_register rn, struct aarch64_register rm)
{
  int emitted = emit_data_processing_reg (buf, MUL, rd, rn, rm);

  return emitted;
}

/* Write a MRS instruction into *BUF.  The register size is 64-bit.

     MRS xt, system_reg

   RT is the destination register.
   SYSTEM_REG is the special purpose register to read; its value fills
   the 15-bit field starting at bit 5.  */

static int
emit_mrs (uint32_t *buf, struct aarch64_register rt,
	  enum aarch64_system_control_registers system_reg)
{
  uint32_t insn = MRS | ENCODE (system_reg, 15, 5);

  insn |= ENCODE (rt.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a MSR instruction into *BUF.  The register size is 64-bit.

     MSR system_reg, xt

   SYSTEM_REG is the special purpose register to write; its value fills
   the 15-bit field starting at bit 5.
   RT is the input register.  */

static int
emit_msr (uint32_t *buf, enum aarch64_system_control_registers system_reg,
	  struct aarch64_register rt)
{
  uint32_t insn = MSR | ENCODE (system_reg, 15, 5);

  insn |= ENCODE (rt.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a SEVL instruction into *BUF.

   This is a hint instruction telling the hardware to trigger an event.
   It takes no operand.  */

static int
emit_sevl (uint32_t *buf)
{
  int emitted = emit_insn (buf, SEVL);

  return emitted;
}

/* Write a WFE instruction into *BUF.

   This is a hint instruction telling the hardware to wait for an event.
   It takes no operand.  */

static int
emit_wfe (uint32_t *buf)
{
  int emitted = emit_insn (buf, WFE);

  return emitted;
}

/* Write a SBFM instruction into *BUF.

     SBFM rd, rn, #immr, #imms

   This instruction moves the bits from #immr to #imms into the
   destination, sign extending the result.

   RD is the destination register; RD.is64 sets both bit 31 and bit 22.
   RN is the source register.
   IMMR is the bit number to start at (least significant bit).
   IMMS is the bit number to stop at (most significant bit).  */

static int
emit_sbfm (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, uint32_t immr, uint32_t imms)
{
  uint32_t insn = SBFM | ENCODE (rd.is64, 1, 31) | ENCODE (rd.is64, 1, 22);

  insn |= ENCODE (immr, 6, 16);
  insn |= ENCODE (imms, 6, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a SBFX instruction into *BUF.

     SBFX rd, rn, #lsb, #width

   This instruction moves #width bits from #lsb into the destination, sign
   extending the result.  This is an alias for:

     SBFM rd, rn, #lsb, #(lsb + width - 1)

   RD is the destination register.
   RN is the source register.
   LSB is the bit number to start at (least significant bit).
   WIDTH is the number of bits to move.  */

static int
emit_sbfx (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, uint32_t lsb, uint32_t width)
{
  /* Number of the most significant bit of the extracted field.  */
  uint32_t msb = lsb + width - 1;

  return emit_sbfm (buf, rd, rn, lsb, msb);
}

/* Write a UBFM instruction into *BUF.

     UBFM rd, rn, #immr, #imms

   This instruction moves the bits from #immr to #imms into the
   destination, extending the result with zeros.

   RD is the destination register; RD.is64 sets both bit 31 and bit 22.
   RN is the source register.
   IMMR is the bit number to start at (least significant bit).
   IMMS is the bit number to stop at (most significant bit).  */

static int
emit_ubfm (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, uint32_t immr, uint32_t imms)
{
  uint32_t insn = UBFM | ENCODE (rd.is64, 1, 31) | ENCODE (rd.is64, 1, 22);

  insn |= ENCODE (immr, 6, 16);
  insn |= ENCODE (imms, 6, 10);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a UBFX instruction into *BUF.

     UBFX rd, rn, #lsb, #width

   This instruction moves #width bits from #lsb into the destination,
   extending the result with zeros.  This is an alias for:

     UBFM rd, rn, #lsb, #(lsb + width - 1)

   RD is the destination register.
   RN is the source register.
   LSB is the bit number to start at (least significant bit).
   WIDTH is the number of bits to move.  */

static int
emit_ubfx (uint32_t *buf, struct aarch64_register rd,
	   struct aarch64_register rn, uint32_t lsb, uint32_t width)
{
  /* Number of the most significant bit of the extracted field.  */
  uint32_t msb = lsb + width - 1;

  return emit_ubfm (buf, rd, rn, lsb, msb);
}

/* Write a CSINC instruction into *BUF.

     CSINC rd, rn, rm, cond

   This instruction places rn in rd if the condition is true, and rm
   incremented by one otherwise.

   RD is the destination register.
   RN and RM are the source registers.
   COND is the encoded condition.  */

static int
emit_csinc (uint32_t *buf, struct aarch64_register rd,
	    struct aarch64_register rn, struct aarch64_register rm,
	    unsigned cond)
{
  uint32_t insn = CSINC | ENCODE (rd.is64, 1, 31);

  insn |= ENCODE (rm.num, 5, 16);
  insn |= ENCODE (cond, 4, 12);
  insn |= ENCODE (rn.num, 5, 5);
  insn |= ENCODE (rd.num, 5, 0);

  return emit_insn (buf, insn);
}

/* Write a CSET instruction into *BUF.

     CSET rd, cond

   This instruction conditionally writes 1 or 0 in the destination
   register.  1 is written if the condition is true.  This is an alias
   for:

     CSINC rd, xzr, xzr, !cond

   Note that the condition needs to be inverted.

   RD is the destination register.
   COND is the encoded condition.  */

static int
emit_cset (uint32_t *buf, struct aarch64_register rd, unsigned cond)
{
  /* The least significant bit of the condition needs toggling in order to
     invert it.  */
  unsigned inverted = cond ^ 0x1;

  return emit_csinc (buf, rd, xzr, xzr, inverted);
}

/* Write a NOP instruction into *BUF.  It takes no operand.  */

static int
emit_nop (uint32_t *buf)
{
  int emitted = emit_insn (buf, NOP);

  return emitted;
}

/* Write LEN instructions from BUF into the inferior memory at *TO, then
   advance *TO past the bytes written.

   Note instructions are always little endian on AArch64, unlike data.
   On a big endian host each instruction is therefore byte-swapped into
   a temporary buffer before being written.  */

static void
append_insns (CORE_ADDR *to, size_t len, const uint32_t *buf)
{
  const size_t nbytes = sizeof (uint32_t) * len;
#if (__BYTE_ORDER == __BIG_ENDIAN)
  uint32_t *swapped = xmalloc (nbytes);
  size_t idx = 0;

  while (idx < len)
    {
      swapped[idx] = htole32 (buf[idx]);
      idx++;
    }

  write_inferior_memory (*to, (const unsigned char *) swapped, nbytes);

  xfree (swapped);
#else
  write_inferior_memory (*to, (const unsigned char *) buf, nbytes);
#endif

  *to += nbytes;
}

/* Helper function.  Return 1 if VAL can be encoded as a signed
   (two's complement) immediate that is BITS bits wide, sign bit
   included, and 0 otherwise.  That is, VAL must lie in the
   -2^(BITS-1) .. 2^(BITS-1) - 1 range.

   For example a B instruction has a 26-bit immediate scaled by 4, so
   its byte offset is a 28-bit signed value in the -2^27 .. 2^27 - 1
   range.

   The check is done with comparisons rather than by shifting VAL, so it
   does not depend on how the compiler shifts negative values.  */

static int
can_encode_int32 (int32_t val, unsigned bits)
{
  int64_t limit;

  /* A zero-width field cannot hold anything.  */
  if (bits == 0)
    return 0;

  /* Any int32_t fits in 32 bits or more.  */
  if (bits >= 32)
    return 1;

  /* One bit is taken by the sign, leaving BITS - 1 bits of magnitude.  */
  limit = (int64_t) 1 << (bits - 1);

  return val >= -limit && val < limit;
}

/* Helper for aarch64_relocate_instruction.  OFFSET is a PC-relative
   offset decoded from an instruction located at OLDLOC.  Compute the
   offset that reaches the same target from TO and store it in *REBASED.

   The computation is done on 64 bits.  Return 1 on success and 0,
   leaving *REBASED untouched, if the result does not fit in an
   int32_t.  */

static int
aarch64_rebase_pc_offset (CORE_ADDR to, CORE_ADDR oldloc, int32_t offset,
			  int32_t *rebased)
{
  /* The sum wraps around on unsigned overflow; reading it back as a
     signed value gives the actual distance.  */
  int64_t distance = (int64_t) (oldloc - to + offset);

  if (distance < INT32_MIN || distance > INT32_MAX)
    return 0;

  *rebased = (int32_t) distance;
  return 1;
}

/* Relocate an instruction from OLDLOC to *TO.  This function will also
   increment TO by the number of bytes the new instruction(s) take(s).
   If the instruction cannot be relocated, nothing is written and *TO is
   left unchanged.

   PC relative instructions need to be handled specifically:

   - B/BL
   - B.COND
   - CBZ/CBNZ
   - TBZ/TBNZ
   - ADR/ADRP
   - LDR/LDRSW (literal)  */

static void
aarch64_relocate_instruction (CORE_ADDR *to, CORE_ADDR oldloc)
{
  uint32_t buf[32];
  uint32_t *p = buf;
  uint32_t insn;

  int is_bl;
  int is64;
  int is_sw;
  int is_cbnz;
  int is_tbnz;
  int is_adrp;
  unsigned rn;
  unsigned rt;
  unsigned rd;
  unsigned cond;
  unsigned bit;
  int32_t offset;

  target_read_uint32 (oldloc, &insn);

  if (aarch64_decode_b (oldloc, insn, &is_bl, &offset))
    {
      /* Distances that do not fit in 32 bits must be rejected before
	 the range checks below, otherwise they would be truncated and
	 could look like a valid offset.  */
      if (!aarch64_rebase_pc_offset (*to, oldloc, offset, &offset))
	return;

      if (can_encode_int32 (offset, 28))
	p += emit_b (p, is_bl, offset);
      else
	return;
    }
  else if (aarch64_decode_bcond (oldloc, insn, &cond, &offset))
    {
      if (!aarch64_rebase_pc_offset (*to, oldloc, offset, &offset))
	return;

      if (can_encode_int32 (offset, 21))
	p += emit_bcond (p, cond, offset);
      else if (can_encode_int32 (offset, 28))
	{
	  /* The offset is out of range for a conditional branch
	     instruction but not for a unconditional branch.  We can use
	     the following instructions instead:

	       B.COND TAKEN    ; If cond is true, then jump to TAKEN.
	       B NOT_TAKEN     ; Else jump over TAKEN and continue.
	     TAKEN:
	       B #(offset - 8)
	     NOT_TAKEN:

	     */

	  p += emit_bcond (p, cond, 8);
	  p += emit_b (p, 0, 8);
	  p += emit_b (p, 0, offset - 8);
	}
      else
	return;
    }
  else if (aarch64_decode_cb (oldloc, insn, &is64, &is_cbnz, &rn, &offset))
    {
      if (!aarch64_rebase_pc_offset (*to, oldloc, offset, &offset))
	return;

      if (can_encode_int32 (offset, 21))
	p += emit_cb (p, is_cbnz, aarch64_register (rn, is64), offset);
      else if (can_encode_int32 (offset, 28))
	{
	  /* The offset is out of range for a compare and branch
	     instruction but not for a unconditional branch.  We can use
	     the following instructions instead:

	       CBZ xn, TAKEN   ; xn == 0, then jump to TAKEN.
	       B NOT_TAKEN     ; Else jump over TAKEN and continue.
	     TAKEN:
	       B #(offset - 8)
	     NOT_TAKEN:

	     */
	  p += emit_cb (p, is_cbnz, aarch64_register (rn, is64), 8);
	  p += emit_b (p, 0, 8);
	  p += emit_b (p, 0, offset - 8);
	}
      else
	return;
    }
  else if (aarch64_decode_tb (oldloc, insn, &is_tbnz, &bit, &rt, &offset))
    {
      if (!aarch64_rebase_pc_offset (*to, oldloc, offset, &offset))
	return;

      if (can_encode_int32 (offset, 16))
	p += emit_tb (p, is_tbnz, bit, aarch64_register (rt, 1), offset);
      else if (can_encode_int32 (offset, 28))
	{
	  /* The offset is out of range for a test bit and branch
	     instruction but not for a unconditional branch.  We can use
	     the following instructions instead:

	       TBZ xn, #bit, TAKEN ; xn[bit] == 0, then jump to TAKEN.
	       B NOT_TAKEN         ; Else jump over TAKEN and continue.
	     TAKEN:
	       B #(offset - 8)
	     NOT_TAKEN:

	     */
	  p += emit_tb (p, is_tbnz, bit, aarch64_register (rt, 1), 8);
	  p += emit_b (p, 0, 8);
	  p += emit_b (p, 0, offset - 8);
	}
      else
	return;
    }
  else if (aarch64_decode_adr (oldloc, insn, &is_adrp, &rd, &offset))
    {

      /* We know exactly the address the ADR{P,} instruction will compute.
	 We can just write it to the destination register.  */
      CORE_ADDR address = oldloc + offset;

      if (is_adrp)
	{
	  /* Clear the lower 12 bits of the offset to get the 4K page.  */
	  p += emit_mov_addr (p, aarch64_register (rd, 1),
			      address & ~0xfff);
	}
      else
	p += emit_mov_addr (p, aarch64_register (rd, 1), address);
    }
  else if (aarch64_decode_ldr_literal (oldloc, insn, &is_sw, &is64, &rt,
				       &offset))
    {
      /* We know exactly what address to load from, and what register we
	 can use:

	   MOV xd, #(oldloc + offset)
	   MOVK xd, #((oldloc + offset) >> 16), lsl #16
	   ...

	   LDR xd, [xd] ; or LDRSW xd, [xd]

	 */
      CORE_ADDR address = oldloc + offset;

      p += emit_mov_addr (p, aarch64_register (rt, 1), address);

      if (is_sw)
	p += emit_ldrsw (p, aarch64_register (rt, 1),
			 aarch64_register (rt, 1),
			 offset_memory_operand (0));
      else
	p += emit_ldr (p, aarch64_register (rt, is64),
		       aarch64_register (rt, 1),
		       offset_memory_operand (0));
    }
  else
    {
      /* The instruction is not PC relative.  Just re-emit it at the new
	 location.  */
      p += emit_insn (p, insn);
    }

  append_insns (to, p - buf, buf);
}

/* Implementation of linux_target_ops method
   "install_fast_tracepoint_jump_pad".

   Build, starting at *JUMP_ENTRY in the inferior, the jump pad for the
   fast tracepoint object at TPOINT installed on the instruction at
   TPADDR.  The pad saves the register state on the stack, takes the
   spin-lock at LOCKADDR, calls COLLECTOR, releases the lock, restores
   the state, executes a relocated copy of the instruction found at
   TPADDR and finally branches back to TPADDR + ORIG_SIZE.

   On success, return 0 and set:
   - *JUMP_ENTRY to the address following the pad;
   - JJUMP_PAD_INSN and *JJUMP_PAD_INSN_SIZE to the 4-byte branch
     instruction the caller has to write at TPADDR;
   - *ADJUSTED_INSN_ADDR and *ADJUSTED_INSN_ADDR_END to the bounds of
     the relocated instruction inside the pad.

   On failure, return 1 and write a message in ERR.  In that case
   *JUMP_ENTRY is left untouched, although part of the pad may already
   have been written to the inferior.

   TRAMPOLINE and TRAMPOLINE_SIZE are not used by this implementation.  */

static int
aarch64_install_fast_tracepoint_jump_pad (CORE_ADDR tpoint,
					  CORE_ADDR tpaddr,
					  CORE_ADDR collector,
					  CORE_ADDR lockaddr,
					  ULONGEST orig_size,
					  CORE_ADDR *jump_entry,
					  CORE_ADDR *trampoline,
					  ULONGEST *trampoline_size,
					  unsigned char *jjump_pad_insn,
					  ULONGEST *jjump_pad_insn_size,
					  CORE_ADDR *adjusted_insn_addr,
					  CORE_ADDR *adjusted_insn_addr_end,
					  char *err)
{
  uint32_t buf[256];
  uint32_t *p = buf;
  /* Branch distances are differences of 64-bit addresses.  Keep them in
     64 bits until they have been range-checked, otherwise a distance
     larger than 4GB would be silently truncated and could wrongly look
     encodable.  */
  int64_t offset;
  int i;
  CORE_ADDR buildaddr = *jump_entry;

  /* We need to save the current state on the stack both to restore it
     later and to collect register values when the tracepoint is hit.

     The saved registers are pushed in a layout that needs to be in sync
     with aarch64_ft_collect_regmap (see linux-aarch64-ipa.c).  Later on
     the supply_fast_tracepoint_registers function will fill in the
     register cache from a pointer to saved registers on the stack we build
     here.

     For simplicity, we set the size of each cell on the stack to 16 bytes.
     This way one cell can hold any register type, from system registers
     to the 128 bit SIMD&FP registers.  Furthermore, the stack pointer
     has to be 16 bytes aligned anyway.

     Note that the CPSR register does not exist on AArch64.  Instead we
     can access system bits describing the process state with the
     MRS/MSR instructions, namely the condition flags.  We save them as
     if they are part of a CPSR register because that's how GDB
     interprets these system bits.  At the moment, only the condition
     flags are saved in CPSR (NZCV).

     Stack layout, each cell is 16 bytes (descending):

     High *-------- SIMD&FP registers from 31 down to 0. --------*
	  | q31                                                  |
	  .                                                      .
	  .                                                      . 32 cells
	  .                                                      .
	  | q0                                                   |
	  *---- General purpose registers from 30 down to 0. ----*
	  | x30                                                  |
	  .                                                      .
	  .                                                      . 31 cells
	  .                                                      .
	  | x0                                                   |
	  *------------- Special purpose registers. -------------*
	  | SP                                                   |
	  | PC                                                   |
	  | CPSR (NZCV)                                          | 5 cells
	  | FPSR                                                 |
	  | FPCR                                                 | <- SP + 16
	  *------------- collecting_t object --------------------*
	  | TPIDR_EL0               | struct tracepoint *        |
     Low  *------------------------------------------------------*

     After this stack is set up, we issue a call to the collector, passing
     it the saved registers at (SP + 16).  */

  /* Push SIMD&FP registers on the stack:

       SUB sp, sp, #(32 * 16)

       STP q30, q31, [sp, #(30 * 16)]
       ...
       STP q0, q1, [sp]

     */
  p += emit_sub (p, sp, sp, immediate_operand (32 * 16));
  for (i = 30; i >= 0; i -= 2)
    p += emit_stp_q_offset (p, i, i + 1, sp, i * 16);

  /* Push general purpose registers on the stack.  Note that we do not need
     to push x31 as it represents the xzr register and not the stack
     pointer in a STR instruction.

       SUB sp, sp, #(31 * 16)

       STR x30, [sp, #(30 * 16)]
       ...
       STR x0, [sp]

     */
  p += emit_sub (p, sp, sp, immediate_operand (31 * 16));
  for (i = 30; i >= 0; i -= 1)
    p += emit_str (p, aarch64_register (i, 1), sp,
		   offset_memory_operand (i * 16));

  /* Make space for 5 more cells.

       SUB sp, sp, #(5 * 16)

     */
  p += emit_sub (p, sp, sp, immediate_operand (5 * 16));


  /* Save SP.  The value stored is the stack pointer as it was before the
     pad pushed anything, hence the (32 + 31 + 5) cells added back:

       ADD x4, sp, #((32 + 31 + 5) * 16)
       STR x4, [sp, #(4 * 16)]

     */
  p += emit_add (p, x4, sp, immediate_operand ((32 + 31 + 5) * 16));
  p += emit_str (p, x4, sp, offset_memory_operand (4 * 16));

  /* Save PC (tracepoint address):

       MOV  x3, #(tpaddr)
       ...

       STR x3, [sp, #(3 * 16)]

     */

  p += emit_mov_addr (p, x3, tpaddr);
  p += emit_str (p, x3, sp, offset_memory_operand (3 * 16));

  /* Save CPSR (NZCV), FPSR and FPCR:

       MRS x2, nzcv
       MRS x1, fpsr
       MRS x0, fpcr

       STR x2, [sp, #(2 * 16)]
       STR x1, [sp, #(1 * 16)]
       STR x0, [sp, #(0 * 16)]

     */
  p += emit_mrs (p, x2, NZCV);
  p += emit_mrs (p, x1, FPSR);
  p += emit_mrs (p, x0, FPCR);
  p += emit_str (p, x2, sp, offset_memory_operand (2 * 16));
  p += emit_str (p, x1, sp, offset_memory_operand (1 * 16));
  p += emit_str (p, x0, sp, offset_memory_operand (0 * 16));

  /* Push the collecting_t object.  It consists of the address of the
     tracepoint and an ID for the current thread.  We get the latter by
     reading the tpidr_el0 system register.  It corresponds to the
     NT_ARM_TLS register accessible with ptrace.

       MOV x0, #(tpoint)
       ...

       MRS x1, tpidr_el0

       STP x0, x1, [sp, #-16]!

     */

  p += emit_mov_addr (p, x0, tpoint);
  p += emit_mrs (p, x1, TPIDR_EL0);
  p += emit_stp (p, x0, x1, sp, preindex_memory_operand (-16));

  /* Spin-lock:

     The shared memory for the lock is at lockaddr.  It will hold zero
     if no-one is holding the lock, otherwise it contains the address of
     the collecting_t object on the stack of the thread which acquired it.

     At this stage, the stack pointer points to this thread's collecting_t
     object.

     We use the following registers:
     - x0: Address of the lock.
     - x1: Pointer to collecting_t object.
     - x2: Scratch register.

       MOV x0, #(lockaddr)
       ...
       MOV x1, sp

       ; Trigger an event local to this core.  So the following WFE
       ; instruction is ignored.
       SEVL
     again:
       ; Wait for an event.  The event is triggered by either the SEVL
       ; or STLR instructions (store release).
       WFE

       ; Atomically read at lockaddr.  This marks the memory location as
       ; exclusive.  This instruction also has memory constraints which
       ; make sure all previous data reads and writes are done before
       ; executing it.
       LDAXR x2, [x0]

       ; Try again if another thread holds the lock.
       CBNZ x2, again

       ; We can lock it!  Write the address of the collecting_t object.
       ; This instruction will fail if the memory location is not marked
       ; as exclusive anymore.  If it succeeds, it will remove the
       ; exclusive mark on the memory location.  This way, if another
       ; thread executes this instruction before us, we will fail and try
       ; all over again.
       STXR w2, x1, [x0]
       CBNZ w2, again

     */

  p += emit_mov_addr (p, x0, lockaddr);
  p += emit_mov (p, x1, register_operand (sp));

  p += emit_sevl (p);
  p += emit_wfe (p);
  p += emit_ldaxr (p, x2, x0);
  /* The lock word is a full 64-bit pointer, so it has to be tested as
     such: looking at w2 only would treat a held lock whose low 32 bits
     happen to be zero as free.  */
  p += emit_cb (p, 1, x2, -2 * 4);
  p += emit_stxr (p, w2, x1, x0);
  /* The status of the exclusive store is a 32-bit value.  */
  p += emit_cb (p, 1, w2, -4 * 4);

  /* Call collector (struct tracepoint *, unsigned char *):

       MOV x0, #(tpoint)
       ...

       ; Saved registers start after the collecting_t object.
       ADD x1, sp, #16

       ; We use an intra-procedure-call scratch register.
       MOV ip0, #(collector)
       ...

       ; And call back to C!
       BLR ip0

     */

  p += emit_mov_addr (p, x0, tpoint);
  p += emit_add (p, x1, sp, immediate_operand (16));

  p += emit_mov_addr (p, ip0, collector);
  p += emit_blr (p, ip0);

  /* Release the lock.

       MOV x0, #(lockaddr)
       ...

       ; This instruction is a normal store with memory ordering
       ; constraints.  Thanks to this we do not have to put a data
       ; barrier instruction to make sure all data read and writes are done
       ; before this instruction is executed.  Furthermore, this instruction
       ; will trigger an event, letting other threads know they can grab
       ; the lock.
       STLR xzr, [x0]

     */
  p += emit_mov_addr (p, x0, lockaddr);
  p += emit_stlr (p, xzr, x0);

  /* Free collecting_t object:

       ADD sp, sp, #16

     */
  p += emit_add (p, sp, sp, immediate_operand (16));

  /* Restore CPSR (NZCV), FPSR and FPCR.  And free all special purpose
     registers from the stack.

       LDR x2, [sp, #(2 * 16)]
       LDR x1, [sp, #(1 * 16)]
       LDR x0, [sp, #(0 * 16)]

       MSR NZCV, x2
       MSR FPSR, x1
       MSR FPCR, x0

       ADD sp, sp, #(5 * 16)

     */
  p += emit_ldr (p, x2, sp, offset_memory_operand (2 * 16));
  p += emit_ldr (p, x1, sp, offset_memory_operand (1 * 16));
  p += emit_ldr (p, x0, sp, offset_memory_operand (0 * 16));
  p += emit_msr (p, NZCV, x2);
  p += emit_msr (p, FPSR, x1);
  p += emit_msr (p, FPCR, x0);

  p += emit_add (p, sp, sp, immediate_operand (5 * 16));

  /* Pop general purpose registers:

       LDR x0, [sp]
       ...
       LDR x30, [sp, #(30 * 16)]

       ADD sp, sp, #(31 * 16)

     */
  for (i = 0; i <= 30; i += 1)
    p += emit_ldr (p, aarch64_register (i, 1), sp,
		   offset_memory_operand (i * 16));
  p += emit_add (p, sp, sp, immediate_operand (31 * 16));

  /* Pop SIMD&FP registers:

       LDP q0, q1, [sp]
       ...
       LDP q30, q31, [sp, #(30 * 16)]

       ADD sp, sp, #(32 * 16)

     */
  for (i = 0; i <= 30; i += 2)
    p += emit_ldp_q_offset (p, i, i + 1, sp, i * 16);
  p += emit_add (p, sp, sp, immediate_operand (32 * 16));

  /* Write the code into the inferior memory.  */
  append_insns (&buildaddr, p - buf, buf);

  /* Now emit the relocated instruction.  */
  *adjusted_insn_addr = buildaddr;
  aarch64_relocate_instruction (&buildaddr, tpaddr);
  *adjusted_insn_addr_end = buildaddr;

  /* We may not have been able to relocate the instruction.  */
  if (*adjusted_insn_addr == *adjusted_insn_addr_end)
    {
      sprintf (err,
	       "E.Could not relocate instruction from %s to %s.",
	       core_addr_to_string_nz (tpaddr),
	       core_addr_to_string_nz (buildaddr));
      return 1;
    }

  /* Go back to the start of the buffer.  */
  p = buf;

  /* Emit a branch back from the jump pad.  The distance must fit in 32
     bits before it can be narrowed for the 28-bit encoding check.  */
  offset = (int64_t) (tpaddr + orig_size - buildaddr);
  if (offset < INT32_MIN || offset > INT32_MAX
      || !can_encode_int32 ((int32_t) offset, 28))
    {
      sprintf (err,
	       "E.Jump back from jump pad too far from tracepoint "
	       "(offset 0x%" PRIx64 " cannot be encoded in 28 bits).",
	       (uint64_t) offset);
      return 1;
    }

  p += emit_b (p, 0, (int32_t) offset);
  append_insns (&buildaddr, p - buf, buf);

  /* Give the caller a branch instruction into the jump pad.  */
  offset = (int64_t) (*jump_entry - tpaddr);
  if (offset < INT32_MIN || offset > INT32_MAX
      || !can_encode_int32 ((int32_t) offset, 28))
    {
      sprintf (err,
	       "E.Jump pad too far from tracepoint "
	       "(offset 0x%" PRIx64 " cannot be encoded in 28 bits).",
	       (uint64_t) offset);
      return 1;
    }

  emit_b ((uint32_t *) jjump_pad_insn, 0, (int32_t) offset);
  *jjump_pad_insn_size = 4;

  /* Return the end address of our pad.  */
  *jump_entry = buildaddr;

  return 0;
}

/* Helper function appending the LEN instructions found at START to the
   code being generated at current_insn_ptr, and advancing
   current_insn_ptr past them.  */

static void
emit_ops_insns (const uint32_t *start, int len)
{
  /* Work on a copy: append_insns moves the address it is given.  */
  CORE_ADDR write_addr = current_insn_ptr;

  if (debug_threads)
    {
      debug_printf ("Adding %d instrucions at %s\n",
		    len, paddress (write_addr));
    }

  append_insns (&write_addr, len, start);

  /* Publish the new end of the generated code.  */
  current_insn_ptr = write_addr;
}

/* Write in BUF the instruction popping register RT from the stack:
   load RT from [sp], then move sp up by one 16-byte cell.  Return the
   value returned by emit_ldr.  */

static int
emit_pop (uint32_t *buf, struct aarch64_register rt)
{
  const int cell_size = 16;

  return emit_ldr (buf, rt, sp, postindex_memory_operand (cell_size));
}

/* Write in BUF the instruction pushing register RT on the stack:
   move sp down by one 16-byte cell, then store RT at [sp].  Return the
   value returned by emit_str.  */

static int
emit_push (uint32_t *buf, struct aarch64_register rt)
{
  const int cell_size = 16;

  return emit_str (buf, rt, sp, preindex_memory_operand (-cell_size));
}

/* Implementation of emit_ops method "emit_prologue".  */

static void
aarch64_emit_prologue (void)
{
  uint32_t buf[16];
  uint32_t *p = buf;

  /* This function emit a prologue for the following function prototype:

     enum eval_result_type f (unsigned char *regs,
			      ULONGEST *value);

     The first argument is a buffer of raw registers.  The second
     argument is the result of
     evaluating the expression, which will be set to whatever is on top of
     the stack at the end.

     The stack set up by the prologue is as such:

     High *------------------------------------------------------*
	  | LR                                                   |
	  | FP                                                   | <- FP
	  | x1  (ULONGEST *value)                                |
	  | x0  (unsigned char *regs)                            |
     Low  *------------------------------------------------------*

     As we are implementing a stack machine, each opcode can expand the
     stack so we never know how far we are from the data saved by this
     prologue.  In order to be able refer to value and regs later, we save
     the current stack pointer in the frame pointer.  This way, it is not
     clobbered when calling C functions.

     Finally, throughtout every operation, we are using register x0 as the
     top of the stack, and x1 as a scratch register.  */

  p += emit_stp (p, x0, x1, sp, preindex_memory_operand (-2 * 16));
  p += emit_str (p, lr, sp, offset_memory_operand (3 * 8));
  p += emit_str (p, fp, sp, offset_memory_operand (2 * 8));

  p += emit_add (p, fp, sp, immediate_operand (2 * 8));


  emit_ops_insns (buf, p - buf);
}

/* Implementation of emit_ops method "emit_epilogue".  */

static void
aarch64_emit_epilogue (void)
{
  uint32_t buf[16];
  uint32_t *p = buf;

  /* Store the result of the expression (x0) in *value.  */
  p += emit_sub (p, x1, fp, immediate_operand (1 * 8));
  p += emit_ldr (p, x1, x1, offset_memory_operand (0));
  p += emit_str (p, x0, x1, offset_memory_operand (0));

  /* Restore the previous state.  */
  p += emit_add (p, sp, fp, immediate_operand (2 * 8));
  p += emit_ldp (p, fp, lr, fp, offset_memory_operand (0));

  /* Return expr_eval_no_error.  */
  p += emit_mov (p, x0, immediate_operand (expr_eval_no_error));
  p += emit_ret (p, lr);

  emit_ops_insns (buf, p - buf);
}

/* Implementation of emit_ops method "emit_add".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x0 + x1.  */

static void
aarch64_emit_add (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_add (insns + len, x0, x0, register_operand (x1));

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_sub".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x1 - x0, i.e. the top value is subtracted from the next-to-top
   one.  */

static void
aarch64_emit_sub (void)
{
  uint32_t buf[16];
  uint32_t *p = buf;

  p += emit_pop (p, x1);
  /* The operand order matters here: x1 holds the value that was pushed
     first (the minuend), x0 the one pushed last (the subtrahend).  This
     is the same "x1 OP x0" order used by the shift and comparison
     operations.  */
  p += emit_sub (p, x0, x1, register_operand (x0));

  emit_ops_insns (buf, p - buf);
}

/* Implementation of emit_ops method "emit_mul".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x1 * x0.  */

static void
aarch64_emit_mul (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_mul (insns + len, x0, x1, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_lsh".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x1 shifted left by x0.  */

static void
aarch64_emit_lsh (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_lslv (insns + len, x0, x1, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_rsh_signed".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x1 shifted right by x0, using an ASRV instruction.  */

static void
aarch64_emit_rsh_signed (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_asrv (insns + len, x0, x1, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_rsh_unsigned".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x1 shifted right by x0, using an LSRV instruction.  */

static void
aarch64_emit_rsh_unsigned (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_lsrv (insns + len, x0, x1, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_ext".

   Emit an SBFX on the top of the stack (x0), with 0 and ARG as its
   bitfield operands.  */

static void
aarch64_emit_ext (int arg)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_sbfx (insns + len, x0, x0, 0, arg);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_log_not".

   Replace the top of the stack (x0) with 1 if it is 0, and with 0
   otherwise.  */

static void
aarch64_emit_log_not (void)
{
  uint32_t insns[16];
  int len = 0;

  /* Compare with zero, then materialize the "equal" condition.  */
  len += emit_cmp (insns + len, x0, immediate_operand (0));
  len += emit_cset (insns + len, x0, EQ);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_bit_and".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x0 & x1.  */

static void
aarch64_emit_bit_and (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_and (insns + len, x0, x0, x1);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_bit_or".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x0 | x1.  */

static void
aarch64_emit_bit_or (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_orr (insns + len, x0, x0, x1);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_bit_xor".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with x0 ^ x1.  */

static void
aarch64_emit_bit_xor (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_eor (insns + len, x0, x0, x1);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_bit_not".

   Replace the top of the stack (x0) with the result of an MVN of
   itself.  */

static void
aarch64_emit_bit_not (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_mvn (insns + len, x0, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_equal".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with 1 if x0 == x1, with 0 otherwise.  */

static void
aarch64_emit_equal (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x0, register_operand (x1));
  len += emit_cset (insns + len, x0, EQ);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_less_signed".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with 1 if x1 < x0 (LT condition), with 0 otherwise.  */

static void
aarch64_emit_less_signed (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  len += emit_cset (insns + len, x0, LT);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_less_unsigned".

   Pop the next-to-top value in x1 and replace the top of the stack (x0)
   with 1 if x1 < x0 (LO condition), with 0 otherwise.  */

static void
aarch64_emit_less_unsigned (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  len += emit_cset (insns + len, x0, LO);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_ref".

   Replace the top of the stack (x0) with the SIZE bytes found at the
   address it holds.  SIZE has to be 1, 2, 4 or 8; for any other value no
   instruction is emitted and emit_error is set.  */

static void
aarch64_emit_ref (int size)
{
  uint32_t insns[16];
  int len = 0;

  if (size == 1)
    len += emit_ldrb (insns + len, w0, x0, offset_memory_operand (0));
  else if (size == 2)
    len += emit_ldrh (insns + len, w0, x0, offset_memory_operand (0));
  else if (size == 4)
    len += emit_ldr (insns + len, w0, x0, offset_memory_operand (0));
  else if (size == 8)
    len += emit_ldr (insns + len, x0, x0, offset_memory_operand (0));
  else
    {
      /* Unknown size, bail on compilation.  */
      emit_error = 1;
    }

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_if_goto".

   Emit the code testing the top of the stack, popping it, and jumping
   to a destination that is not known yet if it was not zero.  The jump
   is emitted as a placeholder NOP: if OFFSET_P is not NULL, *OFFSET_P
   is set to the byte offset of that NOP from the start of the emitted
   sequence; if SIZE_P is not NULL, *SIZE_P is set to its size (4).  */

static void
aarch64_emit_if_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  /* Set the flags from the value being tested.  */
  len += emit_cmp (insns + len, x0, immediate_operand (0));
  /* Load the new top of the stack.  This must leave the Z flag
     alone.  */
  len += emit_pop (insns + len, x0);
  /* Skip the placeholder when the tested value was 0.  */
  len += emit_bcond (insns + len, EQ, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_goto".

   Emit a placeholder NOP for an unconditional jump to a destination
   that is not known yet.  If OFFSET_P is not NULL, *OFFSET_P is set to
   0, the NOP being the only instruction emitted; if SIZE_P is not NULL,
   *SIZE_P is set to its size (4).  */

static void
aarch64_emit_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = 0;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "write_goto_address".

   Write at address FROM an unconditional branch to address TO.  SIZE is
   not used.  */

void
aarch64_write_goto_address (CORE_ADDR from, CORE_ADDR to, int size)
{
  uint32_t branch;
  CORE_ADDR write_addr = from;

  /* The branch offset is relative to the branch instruction itself.  */
  emit_b (&branch, 0, to - from);
  append_insns (&write_addr, 1, &branch);
}

/* Implementation of emit_ops method "emit_const".

   Set the top of the stack (x0) to NUM.

   NOTE(review): the constant is materialized with emit_mov_addr, the
   helper used for addresses in this file -- confirm it covers the full
   range of a LONGEST.  */

static void
aarch64_emit_const (LONGEST num)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_mov_addr (insns + len, x0, num);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_call".

   Emit a call to the function at address FN: the address is loaded in
   ip0 and reached with a BLR.  */

static void
aarch64_emit_call (CORE_ADDR fn)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_mov_addr (insns + len, ip0, fn);
  len += emit_blr (insns + len, ip0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_reg".  */

static void
aarch64_emit_reg (int reg)
{
  uint32_t buf[16];
  uint32_t *p = buf;

  /* Set x0 to unsigned char *regs.  */
  p += emit_sub (p, x0, fp, immediate_operand (2 * 8));
  p += emit_ldr (p, x0, x0, offset_memory_operand (0));
  p += emit_mov (p, x1, immediate_operand (reg));

  emit_ops_insns (buf, p - buf);

  aarch64_emit_call (get_raw_reg_func_addr ());
}

/* Implementation of emit_ops method "emit_pop".

   Discard the top of the stack by popping the next value in x0.  */

static void
aarch64_emit_pop (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_stack_flush".

   Push the top of the stack, which is cached in x0, on the memory
   stack.  */

static void
aarch64_emit_stack_flush (void)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_push (insns + len, x0);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_zero_ext".

   Emit a UBFX on the top of the stack (x0), with 0 and ARG as its
   bitfield operands.  */

static void
aarch64_emit_zero_ext (int arg)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_ubfx (insns + len, x0, x0, 0, arg);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_swap".  */

static void
aarch64_emit_swap (void)
{
  uint32_t buf[16];
  uint32_t *p = buf;

  p += emit_ldr (p, x1, sp, offset_memory_operand (0 * 16));
  p += emit_str (p, x0, sp, offset_memory_operand (0 * 16));
  p += emit_mov (p, x0, register_operand (x1));

  emit_ops_insns (buf, p - buf);
}

/* Implementation of emit_ops method "emit_stack_adjust".  */

static void
aarch64_emit_stack_adjust (int n)
{
  /* This is not needed with our design.  */
  uint32_t buf[16];
  uint32_t *p = buf;

  p += emit_add (p, sp, sp, immediate_operand (n * 16));

  emit_ops_insns (buf, p - buf);
}

/* Implementation of emit_ops method "emit_int_call_1".

   Emit a call to the function at address FN with ARG1 in x0.  Nothing
   is emitted after the call, so x0 is left as the callee set it.  */

static void
aarch64_emit_int_call_1 (CORE_ADDR fn, int arg1)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_mov (insns + len, x0, immediate_operand (arg1));

  emit_ops_insns (insns, len);

  aarch64_emit_call (fn);
}

/* Implementation of emit_ops method "emit_void_call_2".

   Emit a call to the function at address FN with ARG1 as first argument
   and the top of the stack as second argument.  The top of the stack is
   saved before the call and reloaded in x0 after it.  */

static void
aarch64_emit_void_call_2 (CORE_ADDR fn, int arg1)
{
  uint32_t insns[16];
  int len = 0;

  /* Save x0 on the stack so that it can be reloaded after the call.  */
  aarch64_emit_stack_flush ();

  /* Set up the arguments for the function call:

     x0: arg1
     x1: top of the stack

       MOV x1, x0
       MOV x0, #arg1  */

  len += emit_mov (insns + len, x1, register_operand (x0));
  len += emit_mov (insns + len, x0, immediate_operand (arg1));

  emit_ops_insns (insns, len);

  aarch64_emit_call (fn);

  /* Reload x0 with the value saved above.  */
  aarch64_emit_pop ();
}

/* Implementation of emit_ops method "emit_eq_goto".

   Pop the next-to-top value in x1, compare it with the top of the stack
   (x0) and jump to a destination that is not known yet if x1 == x0.
   The jump is emitted as a placeholder NOP: if OFFSET_P is not NULL,
   *OFFSET_P is set to the byte offset of that NOP from the start of the
   emitted sequence; if SIZE_P is not NULL, *SIZE_P is set to its size
   (4).

   NOTE(review): unlike emit_if_goto, x0 is not reloaded from the stack
   after the comparison -- confirm this is what callers expect.  */

static void
aarch64_emit_eq_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  /* Skip the placeholder when x1 != x0.  */
  len += emit_bcond (insns + len, NE, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_ne_goto".

   Pop the next-to-top value in x1, compare it with the top of the stack
   (x0) and jump to a destination that is not known yet if x1 != x0.
   The jump is emitted as a placeholder NOP: if OFFSET_P is not NULL,
   *OFFSET_P is set to the byte offset of that NOP from the start of the
   emitted sequence; if SIZE_P is not NULL, *SIZE_P is set to its size
   (4).

   NOTE(review): unlike emit_if_goto, x0 is not reloaded from the stack
   after the comparison -- confirm this is what callers expect.  */

static void
aarch64_emit_ne_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  /* Skip the placeholder when x1 == x0.  */
  len += emit_bcond (insns + len, EQ, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_lt_goto".

   Pop the next-to-top value in x1, compare it with the top of the stack
   (x0) and jump to a destination that is not known yet if x1 < x0
   (signed).  The jump is emitted as a placeholder NOP: if OFFSET_P is
   not NULL, *OFFSET_P is set to the byte offset of that NOP from the
   start of the emitted sequence; if SIZE_P is not NULL, *SIZE_P is set
   to its size (4).

   NOTE(review): unlike emit_if_goto, x0 is not reloaded from the stack
   after the comparison -- confirm this is what callers expect.  */

static void
aarch64_emit_lt_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  /* Skip the placeholder when x1 >= x0.  */
  len += emit_bcond (insns + len, GE, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_le_goto".

   Pop the next-to-top value in x1, compare it with the top of the stack
   (x0) and jump to a destination that is not known yet if x1 <= x0
   (signed).  The jump is emitted as a placeholder NOP: if OFFSET_P is
   not NULL, *OFFSET_P is set to the byte offset of that NOP from the
   start of the emitted sequence; if SIZE_P is not NULL, *SIZE_P is set
   to its size (4).

   NOTE(review): unlike emit_if_goto, x0 is not reloaded from the stack
   after the comparison -- confirm this is what callers expect.  */

static void
aarch64_emit_le_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  /* Skip the placeholder when x1 > x0.  */
  len += emit_bcond (insns + len, GT, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_gt_goto".

   Pop the next-to-top value in x1, compare it with the top of the stack
   (x0) and jump to a destination that is not known yet if x1 > x0
   (signed).  The jump is emitted as a placeholder NOP: if OFFSET_P is
   not NULL, *OFFSET_P is set to the byte offset of that NOP from the
   start of the emitted sequence; if SIZE_P is not NULL, *SIZE_P is set
   to its size (4).

   NOTE(review): unlike emit_if_goto, x0 is not reloaded from the stack
   after the comparison -- confirm this is what callers expect.  */

static void
aarch64_emit_gt_goto (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  /* Skip the placeholder when x1 <= x0.  */
  len += emit_bcond (insns + len, LE, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* Implementation of emit_ops method "emit_ge_got".

   Pop the next-to-top value in x1, compare it with the top of the stack
   (x0) and jump to a destination that is not known yet if x1 >= x0
   (signed).  The jump is emitted as a placeholder NOP: if OFFSET_P is
   not NULL, *OFFSET_P is set to the byte offset of that NOP from the
   start of the emitted sequence; if SIZE_P is not NULL, *SIZE_P is set
   to its size (4).

   NOTE(review): unlike emit_if_goto, x0 is not reloaded from the stack
   after the comparison -- confirm this is what callers expect.  */

static void
aarch64_emit_ge_got (int *offset_p, int *size_p)
{
  uint32_t insns[16];
  int len = 0;

  len += emit_pop (insns + len, x1);
  len += emit_cmp (insns + len, x1, register_operand (x0));
  /* Skip the placeholder when x1 < x0.  */
  len += emit_bcond (insns + len, LT, 8);

  /* The NOP instruction will be patched with an unconditional branch.  */
  if (offset_p != NULL)
    *offset_p = len * 4;
  if (size_p != NULL)
    *size_p = 4;
  len += emit_nop (insns + len);

  emit_ops_insns (insns, len);
}

/* The AArch64 implementation of the emit_ops methods, as returned by
   aarch64_emit_ops.  The initializer is positional: the entries have to
   stay in the order in which struct emit_ops declares its members.  */

static struct emit_ops aarch64_emit_ops_impl =
{
  aarch64_emit_prologue,
  aarch64_emit_epilogue,
  aarch64_emit_add,
  aarch64_emit_sub,
  aarch64_emit_mul,
  aarch64_emit_lsh,
  aarch64_emit_rsh_signed,
  aarch64_emit_rsh_unsigned,
  aarch64_emit_ext,
  aarch64_emit_log_not,
  aarch64_emit_bit_and,
  aarch64_emit_bit_or,
  aarch64_emit_bit_xor,
  aarch64_emit_bit_not,
  aarch64_emit_equal,
  aarch64_emit_less_signed,
  aarch64_emit_less_unsigned,
  aarch64_emit_ref,
  aarch64_emit_if_goto,
  aarch64_emit_goto,
  aarch64_write_goto_address,
  aarch64_emit_const,
  aarch64_emit_call,
  aarch64_emit_reg,
  aarch64_emit_pop,
  aarch64_emit_stack_flush,
  aarch64_emit_zero_ext,
  aarch64_emit_swap,
  aarch64_emit_stack_adjust,
  aarch64_emit_int_call_1,
  aarch64_emit_void_call_2,
  aarch64_emit_eq_goto,
  aarch64_emit_ne_goto,
  aarch64_emit_lt_goto,
  aarch64_emit_le_goto,
  aarch64_emit_gt_goto,
  aarch64_emit_ge_got,
};

/* Implementation of linux_target_ops method "emit_ops".

   Return the table of AArch64 emit_ops methods.  The same statically
   allocated table is returned on every call.  */

static struct emit_ops *
aarch64_emit_ops (void)
{
  struct emit_ops *ops = &aarch64_emit_ops_impl;

  return ops;
}

/* Implementation of linux_target_ops method
   "get_min_fast_tracepoint_insn_len".

   Always return 4, which is also the size of the branch instruction
   aarch64_install_fast_tracepoint_jump_pad hands back to its caller.  */

static int
aarch64_get_min_fast_tracepoint_insn_len (void)
{
  enum { min_insn_len = 4 };

  return min_insn_len;
}

/* Implementation of linux_target_ops method "supports_range_stepping".

   Range stepping is unconditionally reported as supported.  */

static int
aarch64_supports_range_stepping (void)
{
  const int supported = 1;

  return supported;
}

/* The AArch64 implementation of the linux_target_ops methods.  The
   initializer is positional: the entries have to stay in the order in
   which struct linux_target_ops declares its members.  Members without
   an AArch64 implementation are left NULL (or 0) and carry a comment
   naming the member they stand for.  */

struct linux_target_ops the_low_target =
{
  aarch64_arch_setup,
  aarch64_regs_info,
  aarch64_cannot_fetch_register,
  aarch64_cannot_store_register,
  NULL, /* fetch_register */
  aarch64_get_pc,
  aarch64_set_pc,
  (const unsigned char *) &aarch64_breakpoint,
  aarch64_breakpoint_len,
  NULL, /* breakpoint_reinsert_addr */
  0,    /* decr_pc_after_break */
  aarch64_breakpoint_at,
  aarch64_supports_z_point_type,
  aarch64_insert_point,
  aarch64_remove_point,
  aarch64_stopped_by_watchpoint,
  aarch64_stopped_data_address,
  NULL, /* collect_ptrace_register */
  NULL, /* supply_ptrace_register */
  aarch64_linux_siginfo_fixup,
  aarch64_linux_new_process,
  aarch64_linux_new_thread,
  aarch64_linux_new_fork,
  aarch64_linux_prepare_to_resume,
  NULL, /* process_qsupported */
  aarch64_supports_tracepoints,
  aarch64_get_thread_area,
  aarch64_install_fast_tracepoint_jump_pad,
  aarch64_emit_ops,
  aarch64_get_min_fast_tracepoint_insn_len,
  aarch64_supports_range_stepping,
};

/* Architecture-specific initialization entry point for this file.  */

void
initialize_low_arch (void)
{
  /* Defined in an auto-generated file.  */
  init_registers_aarch64 ();

  /* NOTE(review): presumably needed to debug AArch32 inferiors -- see
     linux-aarch32-low.h.  */
  initialize_low_arch_aarch32 ();

  initialize_regsets_info (&aarch64_regsets_info);
}