@conference {1435482, title = {Analyzing and Improving Fault Tolerance of Learning-Based Navigation Systems}, booktitle = {58th ACM/IEEE Design Automation Conference (DAC)}, year = {2021}, abstract = {Learning-based navigation systems are widely used in autonomous applications, such as robotics, unmanned vehicles, and drones. Specialized hardware accelerators have been proposed to deliver high performance and energy efficiency for such navigational tasks. However, transient and permanent faults are increasing in hardware systems and can catastrophically violate task safety. Meanwhile, traditional redundancy-based protection methods are challenging to deploy on resource-constrained edge applications. In this paper, we experimentally evaluate the resilience of navigation systems with respect to algorithms, fault models, and data types from both RL training and inference. We further propose two efficient fault mitigation techniques that achieve a 2{\texttimes} higher success rate and a 39\% quality-of-flight improvement in learning-based navigation systems.}, url = {https://doi.org/10.48550/arXiv.2111.04957}, author = {Zishen Wan and Aqeel Anwar and Yu-Shun Hsiao and Tianyu Jia and Vijay Janapa Reddi and Arijit Raychowdhury} } @article {1435526, title = {EdgeBERT: Sentence-Level Energy Optimizations for Latency-Aware Multi-Task NLP Inference}, journal = {IEEE/ACM International Symposium on Microarchitecture (MICRO 2021)}, year = {2021}, abstract = {Transformer-based language models such as BERT provide significant accuracy improvements for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimization for multi-task NLP. EdgeBERT employs entropy-based early exit predication in order to perform dynamic voltage-frequency scaling (DVFS) at a sentence granularity for minimal energy consumption while adhering to a prescribed target latency. Computation and memory footprint overheads are further alleviated by employing a calibrated combination of adaptive attention span, selective network pruning, and floating-point quantization. Furthermore, in order to maximize the synergistic benefits of these algorithms in always-on and intermediate edge computing settings, we specialize a 12nm scalable hardware accelerator system, integrating a fast-switching low-dropout voltage regulator (LDO), an all-digital phase-locked loop (ADPLL), as well as high-density embedded non-volatile memories (eNVMs) wherein the sparse floating-point bit encodings of the shared multi-task parameters are carefully stored. Altogether, latency-aware multi-task NLP inference acceleration on the EdgeBERT hardware system generates up to 7{\texttimes}, 2.5{\texttimes}, and 53{\texttimes} lower energy compared to conventional inference without early stopping, the latency-unbounded early exit approach, and CUDA adaptations on an Nvidia Jetson Tegra X2 mobile GPU, respectively.}, url = {https://doi.org/10.48550/arXiv.2011.14203}, author = {Thierry Tambe and Coleman Hooper and Lillian Pentecost and Tianyu Jia and En-Yu Yang and Marco Donato and Victor Sanh and Paul Whatmough and Alexander M. Rush and David Brooks and Gu-Yeon Wei} }
@proceedings {1435470, title = {Application-driven Design Exploration for Dense Ferroelectric Embedded Non-volatile Memories}, booktitle = {IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)}, year = {2021}, address = {Boston, MA, USA}, abstract = {The memory wall bottleneck is a key challenge across many data-intensive applications. Multi-level FeFET-based embedded non-volatile memories are a promising solution for denser and more energy-efficient on-chip memory. However, reliable multi-level cell storage requires careful optimizations to minimize the design overhead costs. In this work, we investigate the interplay between FeFET device characteristics, programming schemes, and memory array architecture, and explore different design choices to optimize performance, energy, area, and accuracy metrics for critical data-intensive workloads. From our cross-stack design exploration, we find that we can store DNN weights and social network graphs at a density of over 8MB/mm$^2$ and sub-2ns read access latency without loss in application accuracy.}, url = {https://doi.org/10.1109/ISLPED52811.2021.9502489}, author = {M. M. Sharifi and L. Pentecost and R. Rajaei and A. Kazemi and Q. Lou and G.-Y. Wei and D. Brooks and K. Ni and X. S. Hu and M. Niemier and M. Donato} } @article {1435335, title = {Gradient Disaggregation: Breaking Privacy in Federated Learning by Reconstructing the User Participant Matrix}, year = {2021}, abstract = {We show that aggregated model updates in federated learning may be insecure. An untrusted central server may disaggregate user updates from sums of updates across participants given repeated observations, enabling the server to recover privileged information about individual users{\textquoteright} private training data via traditional gradient inference attacks. Our method revolves around reconstructing participant information (e.g., which rounds of training users participated in) from aggregated model updates by leveraging summary information from device analytics commonly used to monitor, debug, and manage federated learning systems. Our attack is parallelizable, and we successfully disaggregate user updates in settings with up to thousands of participants. We quantitatively and qualitatively demonstrate significant improvements in the capability of various inference attacks on the disaggregated updates. Our attack enables the attribution of learned properties to individual users, violating anonymity, and shows that a determined central server may undermine the secure aggregation protocol to break individual users{\textquoteright} data privacy in federated learning.
}, url = {https://doi.org/10.48550/arXiv.2106.06089}, author = {Maximilian Lam and Gu-Yeon Wei and David Brooks and Vijay Janapa Reddi and Michael Mitzenmacher} } @article {1435484, title = {MAVFI: An End-to-End Fault Analysis Framework with Anomaly Detection and Recovery for Micro Aerial Vehicles}, year = {2021}, abstract = {Reliability and safety are critical in autonomous machine services, such as autonomous vehicles and aerial drones. In this paper, we first present an open-source Micro Aerial Vehicle (MAV) reliability analysis framework, MAVFI, to characterize the impact of transient faults on end-to-end flight metrics, e.g., flight time and success rate. Based on our framework, we observe that end-to-end fault tolerance analysis is essential for characterizing system reliability. We demonstrate that the planning and control stages are more vulnerable to transient faults than the visual perception stage in the common "Perception-Planning-Control (PPC)" compute pipeline. Furthermore, to improve the reliability of the MAV system, we propose two low-overhead, anomaly-based transient fault detection and recovery schemes built on Gaussian statistical models and autoencoder neural networks. We validate our anomaly fault protection schemes in a variety of simulated photo-realistic environments on both an Intel i9 CPU and the ARM Cortex-A57 on an Nvidia TX2 platform. We demonstrate that the autoencoder-based scheme can improve system reliability by recovering 100\% of failure cases with less than 0.0062\% computational overhead in the best-case scenario. In addition, the MAVFI framework can be used for other ROS-based cyber-physical applications and is open-sourced.}, url = {https://doi.org/10.48550/arXiv.2105.12882}, author = {Yu-Shun Hsiao and Zishen Wan and Tianyu Jia and Radhika Ghosal and Arijit Raychowdhury and David Brooks and Gu-Yeon Wei and Vijay Janapa Reddi} } @article {1434918, title = {RecPipe: Co-designing Models and Hardware to Jointly Optimize Recommendation Quality and Performance}, journal = {MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture}, year = {2021}, pages = {870{\textendash}884}, abstract = {Deep learning recommendation systems must provide high-quality, personalized content under strict tail-latency targets and high system loads. This paper presents RecPipe, a system to jointly optimize recommendation quality and inference performance. Central to RecPipe is decomposing recommendation models into multi-stage pipelines to maintain quality while reducing compute complexity and exposing distinct parallelism opportunities. RecPipe implements an inference scheduler to map multi-stage recommendation engines onto commodity, heterogeneous platforms (e.g., CPUs, GPUs). While the hardware-aware scheduling improves ranking efficiency, the commodity platforms suffer from many limitations requiring specialized hardware. Thus, we design RecPipeAccel (RPAccel), a custom accelerator that jointly optimizes quality, tail latency, and system throughput. RPAccel is designed specifically to exploit the distinct design space opened via RecPipe. In particular, RPAccel processes queries in sub-batches to pipeline recommendation stages, and implements dual static and dynamic embedding caches, a set of top-k filtering units, and a reconfigurable systolic array.
Compared to prior art at iso-quality, we demonstrate that RPAccel improves latency and throughput by 3{\texttimes} and 6{\texttimes}.}, url = {https://doi.org/10.48550/arXiv.2105.08820}, author = {Udit Gupta and Samuel Hsia and Jeff Zhang and Mark Wilkening and Javin Pombra and Hsien-Hsin S. Lee and Gu-Yeon Wei and Carole-Jean Wu and David Brooks} } @article {1435529, title = {Robomorphic Computing: A Design Methodology for Domain-Specific Accelerators Parameterized by Robot Morphology}, journal = {Architectural Support for Programming Languages and Operating Systems (ASPLOS{\textquoteright}21)}, year = {2021}, pages = {674{\textendash}686}, abstract = {
Robotics applications have hard time constraints and heavy computational burdens that can greatly benefit from domain-specific hardware accelerators. For the latency-critical problem of robot motion planning and control, there exists a performance gap of at least an order of magnitude between joint actuator response rates and state-of-the-art software solutions. Hardware acceleration can close this gap, but it is essential to define automated hardware design flows to keep the design process agile as applications and robot platforms evolve. To address this challenge, we introduce robomorphic computing: a methodology to transform robot morphology into a customized hardware accelerator morphology. We (i) present this design methodology, using robot topology and structure to exploit parallelism and matrix sparsity patterns in accelerator hardware; (ii) use the methodology to generate a parameterized accelerator design for the gradient of rigid body dynamics, a key kernel in motion planning; (iii) evaluate FPGA and synthesized ASIC implementations of this accelerator for an industrial manipulator robot; and (iv) describe how the design can be automatically customized for other robot models. Our FPGA accelerator achieves speedups of 8{\texttimes} and 86{\texttimes} over CPU and GPU when executing a single dynamics gradient computation. It maintains speedups of 1.9{\texttimes} to 2.9{\texttimes} over CPU and GPU, including computation and I/O round-trip latency, when deployed as a coprocessor to a host CPU for processing multiple dynamics gradient computations. ASIC synthesis indicates an additional 7.2{\texttimes} speedup for single computation latency. We describe how this principled approach generalizes to more complex robot platforms, such as quadrupeds and humanoids, as well as to other computational kernels in robotics, outlining a path forward for future robomorphic computing accelerators.
}, url = {https://doi.org/10.1145/3445814.3446746}, author = {Sabrina M. Neuman and Brian Plancher and Thomas Bourgeat and Thierry Tambe and Srinivas Devadas and Vijay Janapa Reddi} } @article {1435528, title = {From DSLs to Accelerator-rich Platform Implementations: Addressing the Mapping Gap}, journal = {Workshop on Languages, Tools, and Techniques for Accelerator Design (LATTE{\textquoteright}21)}, year = {2021}, url = {https://capra.cs.cornell.edu/latte21/}, author = {Bo-Yuan Huang and Steven Lyubomirsky and Thierry Tambe and Yi Li and Mike He and Gus Smith and Gu-Yeon Wei and Aarti Gupta and Sharad Malik and Zachary Tatlock} } @article {1435525, title = {A 25mm$^2$ SoC for IoT Devices with 18ms Noise-Robust Speech-to-Text Latency via Bayesian Speech Denoising and Attention-Based Sequence-to-Sequence DNN Speech Recognition in 16nm FinFET}, journal = {International Solid-State Circuits Conference (ISSCC{\textquoteright}21)}, year = {2021}, abstract = {Automatic speech recognition (ASR) using deep learning is essential for user interfaces on IoT devices. However, previously published ASR chips [4-7] do not consider realistic operating conditions, which are typically noisy and may include more than one speaker. Furthermore, several of these works have implemented only small-vocabulary tasks, such as keyword spotting (KWS), where context-blind deep neural network (DNN) algorithms are adequate. However, for large-vocabulary tasks (e.g., {\textgreater}100k words), the more complex bidirectional RNNs with an attention mechanism [1] provide context learning in long sequences, improving ASR accuracy by up to 62\% on the 200k-word LibriSpeech dataset compared to a simpler unidirectional RNN (Fig. 9.8.1). Attention-based networks emphasize the most relevant parts of the source sequence during each decoding time step. In doing so, the encoder sequence is treated as a soft-addressable memory whose positions are weighted based on the state of the decoder RNN. Bidirectional RNNs learn past and future temporal information by concatenating forward and backward time steps.}, url = {https://doi.org/10.1109/ISSCC42613.2021.9366062}, author = {Thierry Tambe and En-Yu Yang and Glenn G. Ko and Yuji Chai and Coleman Hooper and Marco Donato and Paul N. Whatmough and Alexander M. Rush and David Brooks and Gu-Yeon Wei} } @article {1434916, title = {RecSSD: Near Data Processing for Solid State Drive Based Recommendation Inference}, journal = {ASPLOS 2021: Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems}, year = {2021}, pages = {717{\textendash}729}, abstract = {Neural personalized recommendation models are used across a wide variety of datacenter applications including search, social media, and entertainment. State-of-the-art models comprise large embedding tables that have billions of parameters requiring large memory capacities. Unfortunately, large and fast DRAM-based memories levy high infrastructure costs. Conventional SSD-based storage solutions offer an order of magnitude larger capacity, but have worse read latency and bandwidth, degrading inference performance.
RecSSD is a near-data-processing-based SSD memory system customized for neural recommendation inference that reduces end-to-end model inference latency by 2{\texttimes} compared to using COTS SSDs across eight industry-representative models.}, url = {https://doi.org/10.48550/arXiv.2102.00075}, author = {Mark Wilkening and Udit Gupta and Samuel Hsia and Caroline Trippel and Carole-Jean Wu and David Brooks and Gu-Yeon Wei} }