5 ' " "' (3 "' '1I"\I"~° I:(|;I'1 "‘3‘,“ jg." ~5'.I ‘ .. I555 5m“ . 5:55de 5 . £555 .. .,5-45 - ‘5 ’5' ..'.".' 5 555 'I5 ...'5IRI5YJH%II '55%b@55 "I-"". é:""'55}b( ." ." -5 '.' 5,5 'J';flu* 1m . Ivz‘vqu'fgzfir'}; 'l'-".l.5 "'""'¥: 5 "“5 5555.. 55 :' 55:: 5155-55555 .‘5 . 5-. .. . 5.,«55. . 5-“. I I. 55.55"“. '3" . ,. . . (III IJIII. 9‘ ‘.7W" (5’5- : ‘ 5 u”. ' UIt'I ‘ .. 5"}.5 ,. '. - " ". MIC. ' '1 ' ' . ' 1.? '5 .Iifi'fi-quu'k' r '55'..'..‘5."""'5'."',"I“~I-.""'1"-"'I"95}' I§5¢-"..'55551«"‘ '55' .- 1%" ..I'-i.“¢'.<;~.'.'5;"~‘"2 9'? 53%“ a: . .~ :5 .'5'5 55 .. .:'.".I:II.' r554; : .I. .I.' .- .\ P" “ 5'. 4 - I.; ' ""5'"1"-~"“ "-"" I".:""fl':°."5" '1'3"'I' “IQ is" '.' ' : 3" W225! '52": 55" WEEK . ""5. m“ "1 , ' .W r I"-','\"\'."7.I '.‘] . 5' $5 . L. W” ". ;v1I-’.'--_I ’ u »- 1 . {gfflwflw III... I. 55"" 55"";3. 5'5 If. ""1" A35" Ihg'bu , . , .5555... 5...... 5 "" It"; IJII‘V 55%; 1'" "L I "Cipvo ' 5'5"" I"'"I?5[I"7."'"T,""~'IIEFII"C""3'11""):II v";- ”Ml 155M35354€55'9~555“'55“,“#-: I55 5 5, 5. I'm,‘ '.. .I f' 35:51.. ".55'13'J3I5 5:. I45" .. '5. 5 . _'1I"Jéél'."-a;:lf\" F2 :de 5"..." - . 5554’“ 5 5 . f I III. {5. 'ii'U'IHI". "3".“ ' '4'“. $ IIAI'I . ”515%“. ‘_" 5'} 5‘( if?” P“ (J:II I 5.;(J?;,%&{r:.;“‘\~ea '33:": 3”};‘5’1 [My 2‘! "1 Jgu‘l“ "5." '-5" 5 '."".'5:": \ "h" "I"~"'1"'1""Si' "I 2': "'.' "' ISA“ '.':P- ' ' ‘i'll't'flvfi' "‘itfie hfinzégw 'é'fi" flux -“-.\' $35K {WI ",55‘W. flu?“ '.'"; I ' I56; . ',1' """". '"75'5 " ... . . '..I5I""'\ "A" '. "".5‘ " '5. . ' . . "I “.55" (555.13” "-- " .'o" ""-'-'I"\' '.'" 'd'f’ '51] "'“byfluflffi'h '515I'§",~"""""'f“ 5. r5'.', "'.f"‘":""'1‘~‘>d'r¢" $65“ 7" . . I J'III'I'IV 55 .1" 'H5 "3 I ""33"" "I:,'I\.:VL.'L."‘: ['.I'I'. I355 -P ."I'I "5.55 "'{T-.'.;I;..'-..'_"}'.-'5 .. ."3'<7II"-..§5'3'1. 515'!” ,. ”5951'.'¢5.'r'9.5575 :5: II. .. 3-4.5; “5515315555153... '55,. 555' 55* : "1.55.55 " “ ' NW 5155;, I‘I: 5' '~ I ,. 515555.. 5; ( 5'. ‘ I ’III" ;'. :‘C' ' . '.'I'T‘. f‘g . '5'". . - a ‘I t :."x'\'5‘"':.n:‘"‘1'"\a"" 'd‘ “D .g " .5'." I 5 5"3 I"'aI:' ‘7‘; 1')va '.‘ 32"" 3"\ "'".""',"E:.!‘."'I; 1.:ng I'Mt' 1‘3, '. {Eek—f" .blé}: ,II'E‘N ‘.‘ 55,-55'W‘5e ”VF'QW" -. .'5 "'.' .ImLfr-."'" {WV . I .5 J! I ,5 . '1'“ on 5 l '5'" "'J+ "'11" '.I'I {-.. 9",. IL? {I ”9"“ V 1'5" ‘11! [5‘ "'"'J"I"’ .I“ '1’.I:’;)v ... KT: [’Urrwfi '5' .I .I.. I,. “2.3; .55- I. 555155553 In * '.I " .5, wgdbhfl' i‘d y‘j'bb. , '1'" "‘IJI' 31”,." '5"? '"'-'I'2':"' I'll INN ."."' 5' (V 13"“.er W” '.‘ '.' I o I" I" I 5"“ II III .:, I'I’Sf'w '.’.1""' ‘ 5...". 451:5“. "5I5I5’.5cu."."’""‘ ‘.'. """ """""""'I' ""3"" ML" 5.“ 8"". '29.! M‘II‘ 'I'..'I°.,"|'\E‘:“I.¥4l 3"“ ""g"" "'"I': 'I."‘,' 5I5 2'5“, '.':Ilt'd ‘ 1". "' " p" “"""'4\' 41' {'{J'V'JCI‘ WI) " ' ".‘f' '~.'.'.;-:"' ,IN“. 5:", 5'" I" ' | I ' ,' ' .. i" 5 55"I“'-'.:.~I5»'5..13.5555 I... .H 5’5"“; '.'. I!5I’-.‘-.~-.'T555:-.~.5555: ‘ 535 5.. 5.55.. I. . “5555' W 5-55" '. “I". J 5 ., 1 5 "II‘II ‘1. . '.I I . IIII'I'IIL'UL; [\an‘r'tv‘:’ J 1"rfl’t 115‘ '5". u. I'II:§HI‘ '.‘?“ ta ‘9: :5 ' . [It‘d/i‘le 0‘9": #1; I“ '.' ‘ v’b":1';\.(I ;":‘:" it?“ I :I"' ' "' ' " '.'-.3 .. (Ir V‘ "5 '5"555 ‘.' fox" “3" . '.'“ '5' "k"; 0' ‘9']. 3310‘ —PH'. J'y;v'I :z'I ".I J“; (E‘;' *0,“ .o‘u-o I I'I'I .' """'5 '."...r ‘.‘ I: I "."h'vw‘k." It. I”. "..|"I'f"‘""':. '. '.I .. ,."'Q$"'."f.\-' 51:15:: (35:5 - ”ix,“ "\KIIII." W". vv V "0-7: "'5 5"; “5‘“ I”, 5' @AW '.' ‘.‘K 'Rnu'm 5"}:J": '.IJ. 'II' 5"{I' ""71' . ' ":" '.'l‘ |' 'C " I"" I." i I I r ':' I. "s": ‘.'I"' '.‘ “:2; r'kl 1"ll . fi‘ IQII' :"I LI—fi' . I "'I '8 .55' ': "1"""" . "'. ?-""-"""" " . 
5'“ ”5"": ""' I'"""l "' r." '3 ll!" .. 'I. .I'5 'H H5 . I' 1 , . II . 1.. . "."": ”‘.'?" II'H'I‘II '.':iIl‘I v!l"'||.. 51"“ ""i'l"," VIA ll. .I5 1‘1"“.th 3" ' IS ' _ . . é— 5,55 -. 5 nw~5 .... ,5;.5.55555 -. 5 5 5‘, I ' ,II'JH 5 II}, If,“ I n . 5 ' u' 1).} " , 5' I‘.‘ "5 ‘ I; '2‘ .‘I n . ..‘, H. ., . 55' '5' ','. '. "1'." 5" . I‘I' ‘ ’5?" oI'VI" ~. 5' 'l-I’I' $35 5;. -. ' "I": .. '.'flt': ‘ '.' . 2'1""? . 9515':- _ ‘;-- ""' ' I h': II '.’ '5'5 ' .' . ' V "‘.‘ ‘J o‘ . 'cfl'n'r":"".' LI ' "' ' ' “I" ’ ' ‘ ' .‘I'I “”55' ('l 5 ' I Q I L ' ' y— .“W ‘ { (”I'Lf" :fil'd'I-‘Nv I H ' i > | . I J 5,5” I. "5' :'5 I, ‘ .. . . .5" '.'??? ‘5:. ’:"II Ii." "'. 'L-I A" . 5' 5 U 5, - 4< 0 o ' 5 ' :J.fr\ "3' .0 ’ ' ' ' . ' . ,\ ' \." .IILW. ‘.’} .I "' I‘QK'J. b . _ 55 555I~55 . 5 «I.;- ..I.5-5I-gw, .., ‘".5II._ 5" 5 I o. ' .5 '5 5 3'5 I "I (I. "' ' 5 55:41“, ' ‘ ‘ ' .' ':"’ 5 ' ,I' 1.“ ' n ' ’ ' f ' ’5 a "" 'o“' ' I 1" I ”if?" '.':b“ " I f" u 'l ' 1' : f "f ' I I "l' 5"" 5" I I . 5 I '.. 0‘: [35.25 |"'"-' A' ,I 5' 0 U I '.' . "'. IO 5 ."5""""' '.. " . 55' I t' ‘ : . ' ("' {3'53"}. 5‘ :z',. : .II . ' ‘ ' it i ' I. ' .5],1 ’. . . r . , 0 :. I I ' I'I 'II5 ::o:l}‘. 1h); :\ ;:-::|'. 55' "$. ' :5 ... '.‘. 5 '.'I . '55.? I. 5'. I. 3555'». . '.f. '. 515-55.” 35"} I"""" ‘ ' “3- " ' 5 5 ( II 95 '."(I '55. ' "'I I. ' z ". \ ' T'. '. : '5', 5 . "(‘5' '55. ' " "'.' ! pt" I: ” " 5 ‘ 5 .' '. 5 5 I: 5“.;:' 5. . .' '.'“ , I .. 5. .5‘ '.. ‘5 “5 IElla... I': L. . I- a '.' ’I‘II'I15":' .. . . , , . . .5 .5 » . . . ...-r . ..5 ~ I. .555 .5 55' I . ',. " .5 "II' b ‘5'." "' ll " I',' .' "'"E'.'.'" “Us. "" ". .""I' " " I'"*"I ‘, ' :1 . g 0 ""1'. ,I' " .IO ‘ “g . ' I. . .f I ‘5'." III. II I..." III» 1 f'I'J, l . . 5 J ' 05 " u "5'" ""':\ In“) 5" ‘n-l ' ' 5". '. .."'."". -"" ""5"," ’1" ~ "215'”- "'5 ' .5" . . o . "b ' ".v ' "' 5"",I: 0 db ”WI-f ' {'5 ~' " ’.' ’3'" '5' . '1' "ts-'3')" " i I '5 . ' ‘5 «'."5'5. . ’. 5: , .5 'I 5 .A' .5 J55" .i.I..'I"-‘...II' ' .5 .. ..' ‘5‘5 ' ' I. . I I'll 5'I:"I'I 5. ' 5 5 5... II‘ . :5 ' ;','.'"" "°".".'. '1 . 5 ".". 5, 'I ":35'5 I ."' 'I "II . . 55 . 5 I I ""1"" ""I 31'.” .5 5 :1. 5 " .5. 5:?5 .. 5' 5' " .1’5' {I =' I"‘l'~":*"' 1:515 a . I 'I' . I 'b ¢ ;.:'~-I. "' 5.555.. ,. '; .. T'- ."" ,1' ' '.'T'IU-Hw' . M55551 I'quII '.'.,,J"I' : ..- . 5"‘J'I. "'5' 5.5515155 5.. 5.5. . 'IIEIIé‘vI-Iw -~.5 ".'."';I5 'Ii'"I""""" 5". '3"'I"I'I'J','}" I'.""'§'5: 5‘: 'I. 555 ' I'W‘J IL" """I" 5555., '.-:5:‘:...'i:2=55:-.~"5I:I'Ll 5.1.5 51.. 15‘. '..‘ .5555»: 5155555559"W.'2'55IW 15¢»- TH ESIS fin"; aka-(mm. .‘.‘.......__."'" u" f I t. ' L I R R A R Y ' University . 4'31. as.) ‘ This is to certify that the thesis entitled TDMP: A DATA FLOW PROCESSOR presented by George Henry Simmons has been accepted towards fulfillment of the requirements for Ph. D. degreein Electrical Engineering ,flu 2W Major professor Date February 10, 1981 0-7639 ‘» OVERDUE FINES: 25¢ per day per 1m ' l::\\\\ k ‘ . ,(éum L RETURNIE LIBRARY MATERIALS. ” ' 1 Place in book mtum to move w charge from circulation records TDMP: A DATA FLOW PROCESSOR By George Henry Simmons A DISSERTATION Submitted to Michigan State University in partial fulfillment of the requirements for the degree of DOCTOR OF PHILOSOPHY Department of Electrical Engineering and Systems Science 1980 I but" \_;,/ ABSTRACT TDMP: A DATA FLOW PROCESSOR By George Henry Simmons This research investigates a multiple processor computer structure that has a higher bandwidth than comparable single-processor machines yet is more flexible than existing fixed-array multiprocessors. 
The basic approach taken here was to develop, model, simulate, and then analyze a computer structure called TDMP - an acronym for Time Division Multiple Processing. TDMP‘s interprocessor communication network is based on time-division multiplexing and time-division switching techniques and has the following general properties: greater bandwidth than comparable single-processor structures; less complex switching network than a crossbar switch interconnection network; more flexible than fixed-array networks; full-access and non-blocking interconnection capabilities among the processors; simply extended to pipeline operation; and, finally, amenable to VLSI circuit implementation. The justification for this research investigation is five-fold: First, single-processor systems have intrinsic bandwidth limitations known as the 'von Neumann bottleneck.“ This bottleneck is due to the word-at-a-time style of processing through a single communication channel. As a result, computer structures based on this machine design can not exploit the inherent parallelism in specific tasks. Second, while fixed-array-processor structures can exploit this parallelism. their range of usefulness is limited by their fixed interconnection structure. This results in performance degradation when the task structure doesn’t match the physical structure. Third, Dennis, et al., have designed computer structures based on data-flow principles, but they have not explored in detail alternative interconnection networks suitable for George Henry Simmons data-driven computation. Fourth, time-division communication techniques have been successfully applied in the telecommunications industry but have not been investigated for use in multiprocessor structures. Yet this technique appears at first glance to hold good promise under certain overall computer system constraints. This fourth justification leads directly to the fifth. Namely, with multiprocessor structures implemented on VLSI circuit chips, the interprocessor and inter-memory communication paths require a disproportionate amount of chip area when compared with that required by the processors and their memory. So the communication channels determine a significant portion of both the cost in chip area and chip speed. With all of these considerations in mind, the TDMP structure was defined, and APL computer simulation model was developed. This model formally defines the hardware structure, as well as the timing, switching delays, and communication protocols. The model also serves as the basis for analyzing the characteristics of the computer structure and for testing its usefulness in handling various multiprocessor tasks. We limited the model to a sixteen-processor structure. Results of this investigation show that TDMP indeed has a higher bandwidth than comparable single-processor machines. Sample computations show that the specific. TDMP architecture considered has fifteen and ten times the bandwidths of a single-processor system when computing eight-point FFI‘ and second-order digital filter results, respectively. However, in order to achieve these bandwidth improvements over single-processor systems, the granularity of the tasks performed in each of the processors must be large. Results of this research also show that TDMP has greater flexibility than fixed-array multiprocessor structures. This is due to the non-blocking, full-access interconnection capabilities in the TDMP structure, which allows the data paths to be reconfigured for new application programs. 
Simulations also reveal that time—division techniques can be exploited to route information packets in a data-flow George Henry Simmons structure with simple operating system constructs. Moreover, these time-division communication techniques do significantly reduce the V151 circuit chip area devoted to data paths in data-flow structures. ACKNOWLEDGMENTS Never has one individual owed so much gratitude and sincere appreciation to so many peOple for help making a dream come true. Most notable of the many people is Dr. P. David Fisher, my advisor and thesis committee chairman. I would like to personally thank you for your guidance and support in achieving this ambitious task. You are a gifted professional and your contributions should not go unrecognized. 1 would also like to thank my thesis committee members for their suggestions and guidance in my doctoral program: Dr. S. Crouch, Dr. J. Forsyth, Dr. H. Hughes, Dr. J. Kreer and Dr. R. Reynolds. Above all, I would like to thank my wife, Grayce, and my son, Gavin, for the support, tolerance and encouragement they gave me while achieving this task. Chapter II. III. IV. TABLE OF CONTENTS Page INTRODUCTION 1 BACKGROUND 4 2.1 von Neumann Bottleneck 4 2.2 Multiprocessing Survey 8 2.2.1 llliac IV 8 2.2.2 Cmmp 10 2.2.3 PM4 14 2.2.4 Indirect Binary N-Cubc 16 2.2.5 Star-100 21 2.2.6 Cray-l 23 2.2.7 Systolic Arrays 23 2.2.8 Other Commercial Multiprocessors 26 2.3 Observations 26 TIME-DIVISION MULTIPLE PROCESSING 31 3.1 TDMP Architecture 31 3.1.1 PE Array 31 3.1.2 Time-Division Switching Network 37 3.1.3 Control Unit 42 3.2 TDMP Implementation 43 3.3 TDMP: A Data Flow Processor 44 TIMING CONSIDERATIONS 51 4.1 Timing Diagrams 51 4.2 Maximum Multiplexing and Demultiplexing 53 Rates 4.3 TDS Maximum Switching Delay 55 TDMP SIMULATION MODEL 62 5.1 TDS Simulation Model 64 5.1.1 TDS Control Unit 69 5.2 PE Simulation Model 74 5.3 TDMP Simulation Model Operation 83 iii Chapter Page 5.3.1 PE Operation 83 5.3.2 TDS Operation 86 5.4 TDMP Simulation Model Summary 90 V1. TDMP PERFORMANCE EVALUATION 92 6.1 Performance Data 93 6.2 Comparison of Computational Performance 98 6.3 Discussion of Simulation 107 6.4 TDMP and Dennis Data Flow Models 109 V11. CONCLUSIONS 111 7.1 Summary 111 7.2 Further Research 115 REFERENCES l 17 iv Figure 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.10 2.11 3.1 3.2 3.3 LIST OF FIGURES Alternative implementation of the 2nd-order difference equation -- non-von Neumann style processor. A block diagram of the structure of ILLIAC IV. A block diagram of the data paths between processing units in ILLIAC IV. A block diagram of the CMU Multiminiprocessor (Cmmp) architecture. A block diagram of the PM4 architecture. A block diagram of the indirect binary N-Cube array architecture. A block diagram of an individual switch node in the indirect binary N-Cube array. A block diagram of the indirect binary N-Cube array control system. A block diagram of the Star-100 data paths. A functional block diagram of the Cray-1 architecture. The hex connected systolic array for matrix multiplication. Block diagram of TDMP architecture. Block diagram of 121:] PE array and control unit. Block diagram of a TDMP processing element. Page ll 13 15 18 19 20 22 24 25 32 33 35 Figure 3.4 3.5 3.6 3.7 3.8 3.9 3.10 4.1 4.2 4.3 4.4 5.1a 5.1b 5.2 5.3 5.4 5.5 5.6 Example of incoming and outgoing PE communication paths. Example of time slot interchanging in the TDS. Example of both space and time switching in the TDS. Block diagram of a single input/output time-division switching network. 
Data fiow program graph of the 2nd-order difference equation. Example of a data flow branch operation. Example of higher-level conditional expression evaluation. Timing and control signals for a 16 time slot TDMP system. Matrix representation of the P and Q control signals. Flowchart for TDS control algorithm. An example to illustrate the A N versus AzLogzN switching delay issue. Block diagram of the TDS APL simulation model. Part of the TDS APL simulation model. TDS simulation model memory map. Block diagram of TDS the control unit. Addressing modes for the TDS network C-move processor. Control unit logic network specification table. Examples of TDS processor move operations. vi Page 36 38 39 40 45 47 48 52 54 56 59 65 66 68 70 72 73 75 Figure Page 5.7a Block diagram of the PE APL simulation model. 76 5.7b Part of the PE APL simulation model. 77 5.8 PE states for data flow computing. 80 5.9 Logic truth table for the outgoing part of the CILU. 81 5.10 Logic truth table for the incoming part of the CILU. 82 5.11 Flowchart of the PE control algorithm. 84 5.12 PE control program code. 85 5.13 Flowchart for the TDS control algorithm. 87 5.14 TDS control program code. 88 6.1 Block diagram representation of a single PE TDMP model. 94 6.2 Timing diagram for a two time slot model. 96 6.3 Signal fiow graph of an 8-point FFT. 99 6.4 Bandwidth comparison as a function of the average number of FFT operations. 103 6.5 Bandwidth comparison as a function of PE execution time for the FFI‘ computation. 105 6.6 Bandwidth as a function of multiplication time for the digital filtering computation. 108 vii CHAPTER 1 INTRODUCTION In many scientific and engineering computer application areas, the computer's computational speed limits its range of usefulness. Examples of such areas include energy and power model- ing, weather modeling and forecasting, fluid dynamics studies, computer-assisted tomography, and artificial intelligence.1 Several parallel approaches are being taken to extend the useful range of computers. The first approach involves reducing the switching delay times of elemen- tary gates, which embraces both the areas of device physics and integrated circuit (IC) technol- ogy.2 With the second approach, the switching delays are assumed to be fixed but improved methods are sought for performing primitive single-operand and double-operand arithmetic operations, such as transcendental functions, integer arithmetic, and floating-point arithmetic.3 The third approach assumes that the primitive arithmetic operations are given; instead, the focus is on the design of computer architectures that overcome the intrinsic speed limitations of the von Neumann machine.4 The first approach is inherently limited in that the effects of reduced IC dimensions and the introduction of new logic families is expected to reduce gate delays and, consequently, improve overall computer system performance by only a couple of orders of magnitude.1’5 While this represents an important improvement, it does not in itself achieve the long-range improvements needed. Gains with the second approach will result largely due to decreased cost per bit and increased density per bit of semiconductor memories and logic arrays. 
Older algorithms for primitive arithmetic Operations will be modified to fully exploit speed improvements possible with ROM look-up tables or the advantages of performing concurrent conditional computations, such as conditional sum addition.3’6 While the third approach is ultimately limited by gate delays and the speed of primitive arithmetic algorithms, it provides the potential for making the largest increase in computational speed. Here, multipro- cessors are employed; and these multiprocessor architectures exploit the inherent parallelism in specific tasks, thereby eliminating the single processor bottleneck--'von Neumann bottleneck."4 We will restrict ourselves to this third approach. Moreover, we will restrict our attention to a class of multiprocessor architectures that can be implemented on a single very large scale integration (VLSI) chip or a small number of such interconnected chips. Historically, high-performance multiprocessor computer systems have been very expensive, special purpose, and generally research-oriented tools. V1.81 technology is changing this so as to make it feasible to build high-performance multiprocessor structures as low-cost, single-chip system components. VLSI technology is a statement about system complexity, not about transistor size or circuit performance. VLSI defines a technology capable of creating systems so 2 From this complicated that coping with the raw complexity overwhelms all other difliculties. definition, we can see that the way in which the computer industry designs multiprocessor com- puters in VLSI technology must, in fact, be different from the way it has traditionally designed computers in other technologies. For example, in VLSI technology the transistors will be almost 'free' and the interconnection data paths - communication - will determine the cost in 2 This is true because the interconnection paths in VLSI tech- both area and speed of the chip. nology are the same width as a transistor, which means that these paths reduce the available active chip area. So, for VLSI implementation of multiprocessor structures simple and regular underlying communication geometry is required to reduce the total amount of interconnection path lengths on the chip. In addition, the processors should be identical to reduce the layout time and effort of the architecture, the design should be partitionable into segments of manage- able size and these designs should have a wide range of use to justify the costs for a component manufacturer. As a result, this work is based on the following premise: Continued advances in integrated circuit fabrication technology will permit chip complexities to increase more than three orders of magnitude over what they were at the end of 1979.7 This research investigates an alternative single-chip microprocessor architecture that exploits this technology to improve significantly the computational capabilities and general usefulness of small computer-based sys- tems suited for signal processing computations such as waveform generation, modulation and filtering. 
Specifically, an alternative multiprocessor structure, called time Division Multiple Pro- cessing (TDMP), is investigated and has the following general properities: — greater bandwidth than comparable single-processor architectures; — less complex switching network than a crossbar switch interconnection network; — more flexible than fixed-array network; — f all-access and nonoblocking interconnection capabilities; — simply extended to pipeline operation; — highly compatible with data flow algorithms; — amenable to VLSI implementation. The TDMP structure is evaluated by comparing its performance to a single-processor archi- tecture as one boundary of performance and to a fixed-array-processor architecture as the other boundary of performance. Bandwidth, hardware complexity, flexibility, and regularity are the four principal figures of merit. Chapter 2 of this thesis contains a brief review of several key existing multiprocessor struc- tures, including their performance and range of usefulness. In Chapter 3, the organization and operation of TDMP is presented, along with an analysis of hardware complexity. Chapter 4 presents timing diagrams, closed-form expressions for the maximum multiplexing and demulti- plexing rates and the maximum switching delays. Chapter 5 describes the TDMP simulation model. In Chapter 6, the simulation results of two applications are given. The first is a fast-Fourier-transform and the second involves a digital filtering computation. We also com- pare the computational bandwidth among the TDMP system, single-processor system and fixed-array-processor system for these two applications. Finally, Chapter 7 gives a summary of this research as well as some suggestions for future research. CHAPTER 11 BACKGROUND 2.! von Neumann Bottleneck Single-processor computer systems have intrinsic speed limitations that have been ascribed to the 'von Neumann bottleneck." The term 'von Neumann bottleneck" was coined by Backus-I to represent the word-at-a-time style of processing that is characteristic of von Neu- mann machines--the model computer conceived by von Neumann and others about 35 years ago.7’8 The von Neumann computer is composed ideally of a central processing unit (CPU), a memory that contains data and instructions, and a connecting channel that can transmit a single word at a time between the CPU and the memory (and send an address to memory). The con- necting channel is where the von Neumann bottleneck occurs. The reason for its name is that all computational tasks in the von Neumann machine must be accomplished entirely by pump- ing single words back and forth through this connecting channel. A large part of the traflic in the bottleneck is not useful data but merely names of data, as well as operations and data used only to compute such names. As a result, single-processor bandwidth, defined as the maximum throughput measured in terms of the maximum number of results that can be generated per unit time, is always limited by the von Neumann bottleneck as the computational operations get sufliciently larger or complex. An example will serve to illustrate the issue. In digital filtering, a group of operations is performed once for each sample (in time) of the signal being processed. 
For purposes of this calculation, assume that the processor is a 16-bit microprocessor with a 200 nsec cycle time and the floating-point multiplication, division, addition and subtraction operations are performed in a hardware coprocessor.9’lo Single-precision, floating-point addition and subtraction are per- formed in 14 nsec and 18 nsec respectively, and double-precision extended multiply and divide operations are performed in 27 nsec and 39 nsec, respectively. If the second order recursive filter Y(2) _ A0+A.z"' ”(2) - Km 1 —a.:" — 15:22“2 (1) is implemented with this processor as a difl'erence equation of order 2 with constant coefficients Y(n)=AoX(n)+A,X(n—1)+B,Y(n-1)+BzY(n-2), (2) then the maximum filter bandwidth is limited to approximately 3.3 kHz. This bandwidth is fixed by the minimum time it takes the processor to perform the group of operations for each sample of the signal. The group of operations consists of one-at-a-time execution of four multiplication and three addition operations. For simplicity, we assume it takes zero time to move data and to fetch instructions from memory. In this case, the minimum execution time is, Tm,“ = 41” + 3:,4 - 4(27 nsec) + 3(14 nsec) = 150 nsec where t” and 1,, are the floating-point multiplication and addition times, respectively. There- fore, for every Tum, units of time a group of operations is completed, this corresponds to a pro- cessor bandwidth of (Twin)"' - 6.6 kHz and a maximum filter bandwidth of (213m)" - 3.3 kHz. If the order of the filter gets higher, the number of group operations incrcascs; consequently, the processor and maximum filter bandwidth decreases. The shrinking maximum filter bandwidth is caused by the von Neumann bottleneck and limits the processor’s usefulness in many filtering applications, such as those found in telecommunication systems, where the maximum filter bandwidth requirement is 4 kHz and the filter order is 5.11 The von Neumann bottleneck can be eliminated by exploiting the inherent parallelism in a group of operations. For example, if the group! of operations for the second order difference equation in our previous example is implemented as shown in Figure 2.1, the processor bandwidth is significantly improved. Three successive independent stages of computations are executed in parallel and the output of one stage feeds the next, analogous to an industrial assembly line. If we assume a continuous input stream of values, zero time to move data between stages and ignore any start-up times, the minimum execution time, TL“, - maxlt,,r2,13} - speed of the slowest stage in the system, where r. = r”, I; - IA and, t3 = (,4. Processor bandwidth is -tl- - 37 kHz because every Tim = r”, u a result can leave the system. This determines a maximum filter bandwidth of BW- 1. - l -18.5kHz. 2Tmin 2’” A direct comparison shows that 5.67:“,n - Tum; hence, the non-von Neumann style processor has a five-fold processor and maximum filter bandwidth improvement over the von Neumann style processor. This improvement was achieved by eliminating the one-operation-at-a-time style of processing through a single channel--the 'von Neumann bottleneck.” Recognizing that the von Neumann bottleneck limits processor bandwidth, we must develop processor systems that compute larger units of the task at hand. To accomplish this, multipro- cessing concepts are employed in new processor architectures to improve bandwidth. Two widely used forms of multiprocessing are parallel processing and pipeline processing. 
Parallel processing improves processor bandwidth by using many processors operating in paral- lel, either on difl'erent data sets or on difl'erent portions of the same data set.12’l3 Since, in fact, the processors are not always independent, they may require access to the same data or interchange of results between processorsm’15 Flexible interconnection networks are needed X(n) Y(n-1) Y(n-2) Stage 1 Stage 2 + + Stage 3 + ' '— Y(n-1 ) Y(n-2) #- DELAY DELAY V Y(n) Floating-Point multiply unit Floating Point add unit lLlH DELAY Register unit Fig. 2.1 Alternative implementation of the 2nd order difference equation -- non-von Neumann style processor. here to efficiently handle the communication problems. By flexible interconnection networks, we mean networks that can change their data flow pattern under program control. It turns out that most applications are sped up by transforming their traditional sequential algorithms to use the multiprocessor structure, not by executing on a processor with ever shorter memory access times. ‘6 Pipeline processing, on the other hand, improves processor bandwidth by subdividing the processing to be done into sequential functions and then assigning a processor to each function. These functions are then arranged in a 'pipeline" so that the output of one stage feeds the next.13 ’17’18 The processor bandwidth of this approach is limited by the speed of the slowest processor in the pipeline if fixed interconnection networks are used to feed the pipeline stages. By a fixed interconnection network, we mean a network whose data flow pattern cannot be altered. Jump presents quantitative techniques for the evaluation and comparison of digital sys- tem pipelines.l7 2.2 Multiprocessing Survey In this section, we briefly review several key multiprocessor structures in order to ideptify their multiprocessing attributes. We also suggest how well suited these attributes are for VLSI implementation for enhanced computation. Unless dictated by the need for under- standability, we have avoided material not directly related to the multiprocessing aspects of these systems. 2.2.1 llliac IV. llliac IV is a single-instruction stream multiple-data stream (SIMD) experi- mental computer designed at the University of Illinois in the late 1960‘s.19 The general struc- ture of llliac IV is shown in Figure 2.2. It contains 64 identical processing units (PU) with a common external control unit (CU), a four nearest-neighbor interconnection structure, an interface to a supervisory host computer and a switching network for interchanging data and instructions between the CU, host computer and memory. A PU comprises a processing I P-ih :- .-:L_Q" . -—— I , __ L" "1 ”'1 l4— . ‘ —i i t 'r---1 ‘--- -> PE F‘“ "—" r f .L_£_H—_'_"2_h—— .——. CONTROL ...4 mm . I . ‘ ‘ I SHITCH I l TI'TI‘ I ' t , supv. L... p563 —-'i p” ‘-~ conpurra ~ MEMORY , n—1 53 ___‘ Fig. 2.2 A block diagram of the structure of ILLIAC IV. 10 element (PE) and a processing memory (PM). A PE is a general-purpose arithmetic logic unit (ALU) capable of executing a conventional instruction set that includes 64-bit floating-point operations. Each PM has a capacity of 2 k, 64-bit words.”’19 The control unit fetches instruc- tions from a processing element’s memory, decodes them, and issues control pulses to the pro- cessing elements 'for execution. It broadcasts memory addresses and data words when they are common to all processors. An instruction can be either a control or a processing unit instruc- tion. 
The former directs operations local to the control unit, whereas the latter controls the execution of the processing units. The control is designed to overlap the executions of the two different instruction types.4 In addition to the common data and control buses that link the PUs to the CU, there are direct data paths connecting each PU to four neighboring PUs. Specifically, PU,- is connected to PU} if j -= i +1 (modulo 64), j - i- l (modulo 64), j - i + 8 (modulo 64) or j = i - 8 (modulo 64). The PUs form a two-dimensional array, Figure 2.3. For this reason, Illiac IV is often referred to as an array processor.4’l3'20 The array organization is very eflective in exploiting parallelism when the characteristics of the problem to be solved match the physical structure of the array."12 Matrix operations pro- vide an example of this kind of problem.12 The uniform processors and simple, regular com- munication paths satisfy VLSI implementation requirement.21 A disadvantage of the array organization is the inflexibility in the interconnection structure. This results in performance degradation when the problem structure does not match the physical structure of the array and reduces the universal appeal which brings about some high pressure constraints for VLSI imple- mentation.4’12’22 The failure of a single processing element can hamper the operation of the entire system; however, by adding alternate data paths in each processor, similar to a Chordal Ring interconnection network, the system would have a graceful degradation.23 2.2.2 Cramp. Cmmp is a multiple-instruction stream multiple-data stream (MIMD) 11 «- Wsa -- PU if ‘- pu ‘- G-l 55 -u- p”56 — P”57 p”58 .. p”63 PUo Fig. 2.3 A block diagram of the data paths between processing units in ILLIAC IV. 12 multiprocessor computer designed and built at Carnegie-Mellon University for computer archi- tecture and artificial intelligence research work.14 The hardware and software were designed with the goals of symmetry and general purpose in mind. By symmetry, we mean that repli- cated components, such as processors, are treated as an anonymous pool; no one of them is special in any sense. By general purpose, we mean the multiprocessor character of the machine is used to improve throughput across a set of independent jobs as well as to multiprocess single jobs.16 The system consists of 16 PDP—ll minicomputers connected to each other through an interprocessor bus and individually connected to 16 memory modules through a full-access, non-blocking, crossbar switching network. Each minicomputer operates like an independent processing system with its own primary memory and controllers. The interprocessor bus allows any processor to generate an interrupt to any subset of the processor configuration at any of several priority levels. No data is carried by this bus. The crossbar switch allows any processor to establish a path to any memory module--full access and all permutations of individual pro- cessors connected to individual memory modules are possible-~non-blocking. The switch is under both processor and manual control. Collectively, the 16 processors execute six-million instructions per second; the total memory bandwidth is about 500 million bits per second. Despite the fact that Cmmp is built from minicomputers, it is a large-scale machine. Figure 2.4 shows'the basic structure of Cmmp along with other system components. 
The other system components include another crossbar switching network to allow any processor to communicate with any of the various controllers which manage secondary memories, and input/output dev- ices. The basic design goals of Cmmp were achieved, and the research revealed some important observations about multiprocessor systems. Specifically, that the raw speed in the design of the switching network and the processor is not very important but reliability of the system is very important.16 The crossbar switching network provides full-access and non-blocking interconnec- tion capabilities for processor communication. Some criticisms of the system are: the hardware 13 16 x 16 Cross Bar Switch r--------------- ----- -1 l—l . ' Mg" 1‘\ .1'\ 1", ' ,_1__ . ‘l’ ‘l’ ‘1’ . I 2 I t ' l 39‘ 'sl Ax AK ' 14 I VI \1 \J l ..._..J I 1 ""'” I I ”9‘ rx {x 15 15 I‘\J ‘NJ‘ 1 I .___l 1 H, I HEM ' Xx [x rx ' 16 ' \J V1 ‘H ' L a - - - - - ............... J L Li. H .‘IEN EM p HAP 1A? F'_ ‘ F“ 1r 1 ICPI RC 43361) was] l;] Cross-Bar-Switch K Input/Output Controller Minicomputer (POP-ll) Memory Interrupt control PE 3 III. KC Fig. 2.4 A block diagram of the CMU multiminiprocessor (Cmmp) architecture. 14 is less reliable than desired, they were unable to partition Cmmp into disjointed systems, and there is not enough human engineering into the software interface to the user.16 The N2 growth of the crossbar networks make this architecture a poor candidate for VLSI implementa- tion. The crossbar connecting paths and switches use substantial amounts of the chip area, thereby reducing the size of a network that can be implemented on a given chip.24 2.2.3 PM4. PM4 is a reconfigurable multiprocessor system for pattern recognition and image processing research work and is currently under development at the Advanced Automa- tion Research Laboratory of Purdue University.15 The system envisioned consists of hundreds of Large Scale Integration (L81) bit-slice microprocessors and a three-level hierarchical memory connected by a set of interconnection networks.15’25 Figure 2.5 shows a basic block diagram of PM4. The system consists of N identical proces- sors with local memory (PMU), K identical vector control units with local memory (VCU), shared memory connected to the processors through a delta interconnection network (PMIN), file memory connected to processors by a shared bus and a yet undesigned interprocessor con- nection network which permutes data among processors (IPCN). The system can reconfigure its resources under system control to assume four different operation modes: 1. SIMD MODE - Single-Instruction Steam and Multiple-Data Stream (SIMD), Illiac IV.19 In this mode, the same instruction is executed by a subset of the processors operating on difl'erent data streams. This mode is used for vector operations with the vector control unit. 2. Multiple SIMD Mode - In this mode, a multiple number of SIMD operations are executed in parallel. 3. MIMD Mode - Multiple-Instruction Stream and Multiple-Data Stream (MIMD), Cmmp.M 15 L» ------J------ H Bus --------1 ------J Delta Network Processor-Memory Interconnection Network (PMIN) S ML" so Shared Memory SH L-l,H-l File Memory Control Unit File "Emory Fig. 2.5 A bIOck diagram of the PM4 architecture. 16 The processors perform independent tasks on separate data streams concurrently. 4. Distributive Mixed Mode - In this mode, SIMD vector instructions and parallel MIMD processes are simultaneously executed. The Vector Control Units (VCU) are used in the SIMD mode of operation. 
The VCU broadcasts instructions on the vector control bus to all processors that are assigned to the SIMD process. The VCU may also send control signals to a time-shared Interprocessor Communica- tions Network (IPCN) to switch the data in a group of processors. The IPCN is time-shared among VCUs during multiple SIMD processes. The VCU has the ability to mask or disable processors so that only the active or unmasked processors execute the broadcasted instructions. During, execution, a multiplexor is used to route broadcasted instructions from a VCU to a pro- cessor. A delta interconnection network is used to connect the processors to shared memory 26,27 for block transfer of information. PM4 architectural features are based on existing machines such as Cmmp and llliac IV.”’19 The architectural advantages of the above systems have been incorporated into PM4. The flexi- bility in the PM4 architecture allows it to overcome some of the architectural disadvantages in existing MIMD and SIMD machines. However, because of the processor speed requirements for swift reconfiguration and communication geometry (three different interconnection net- works), a single or couple chip V151 implementation of PM4 may not be applicable. Micropro- cessors such as Motorola 6800, 15111, Intel 8086, and 28000 do not meet PM4’s speed requirements and neither will V151 processors, since they are likely to be of only moderate speedw’28 V151 performance measures indicate that chip area requirements for delta networks are the same as those for crossbar networks.24 The difliculties of V151 implementation of crossbar networks were discussed in Section 2.2.2. 2.2.4 Indirect binary real». The indirect binary n-cube microprocessor array is a multipro- cessor architecture in which processors are interconnected by a switching network whose set of 17 connections can be described by the set of edges of the binary n-cube.28 It is called indirect because the array is not actually interconnected according to the topology of the binary n-cube. The basic form of the array is illustrated in Figure 2.6 for n - 4, N '- 2‘ = 16. The circles in Figure 2.6 represent the microprocessors, indexed from 0 to 15 as indicated by the numbers in the circles. The lines on the right from the switching network connect back to the microproces- sors with the indices given in parentheses. Each switch node, indicated by the squares in Fig- ure 2.6, has two input lines, two output lines and can be put into either of the two states shown in Figure 2.7, providing a “direct“ or a "crossed” connection. Each level of switching represents a dimension in the n-cube; in this case, n - 4. The network is used to permute data among microprocessors. In some cases, it may be necessary to make multiple passes through the network to obtain permutations of the data that are otherwise unrealimble. Multiple passes will, of course, entail a sacrifice of speed, but give added flexibility. In addition, as the n-cube dimensions get larger, the number of switching lev- els in the array increase, which also sacrifices speed in the system. The system has a two-level control system for the microprocessors, based on variable microprogramming stored within the microprocessors. This means that global commands sent by the main control unit to the microprocessors may be interpreted difl‘erently by each microprocessor. For the switch nodes a set of switch controllers are used to control a set of switch nodes. These switch controllers receive global commands from the main control unit. 
The control system is shown in block diagram form in Figure 2.8. This processing array can be used effectively for a broad range of SIMD applications. The regularity and modularity of its structure makes it an attractive candidate for V151 implementa- tion. However, the range of application of the array is limited by the structure of the binary n-cube. The binary n-cube does not have full-access and non-blocking capabilities.28 In addi- tion, the complexity of implementing indirect binary n-cube networks in V151 technology is the 18 \/ l [ Level 3 Level 4 SN 5n (0.21 10.3) 9:11 a r .le A U! o O A as O O ,L/m— :J/n a: as arl E? as a? >4 _ ><>< Fig. 2.6 A block diagram of the indirect binary n-cube array architecture. \ us b ‘—’ \J 19 81— —l—ao bi — — l_ ho (a) direct ai _—‘ I_ 30 bi -— ‘_ b0 (b) cross Fig. 2.7 A block diagram of an individual switch node in the indirect binary n-cube array. 20 l Controller l i j J SW3 tCh e o 0 Switch Contro1ler Controller ‘ — 0 0 Fig. 2.8 A block diagram of the Indirect binary n-cube array control system. 21 same as that for crossbar networks.24 2.2.5 STAR-100. The Control Data STAR-100 computer is a high-performance pipeline processor structured around a four-million byte, high-bandwidth memory.20’29 Instructions specify operations on variable length data streams. A block diagram of the STAR-100 memory-pipeline data paths is shown in Figure 2.9. The core memory and the data bus configuration have been designed to support a pipeline rate of 100 million 32-bit floating-point operations per second. The core memory has 32 interleaved banks, with each bank containing 2 k 512-bit words. This memory system can support 512 bits of data per minor cycle and there are 32 minor cycles. The width of the memory data bus for each group of four banks is 128 bits. The data bus transfer rate is 128 bits per minor cycle. Four buses are active with each bus transferring data at a rate of 128 bits per minor cycle. Two of the buses are used for transferring operand streams to the pipeline processor. The third bus is used for storing the resulting stream ele- ments and the fourth bus is shared between input/output storage segments and control vector references. The read and write buffers are used to synchronize the four active buses. The memory segments are bufl’ered to space them eight banks apart, eliminating memory conflict situations. The floating-point arithmetic section of the ST AR-100 consists of two independent pipeline processors. Processor] consists of a pipeline floating-point addition unit and a pipelined floating-point multiplication unit. Processor 2 consists of a pipelined floating-point addition unit, a non-pipelined floating-point divide unit and a pipelined multipurpose unit which is capa- ble of performing a floating-point multiplication, divide, or square root Operation. The memory interleaving and pipeline processing makes STAR-100 very efficient for pro- cessing vectors. Furthermore, its overall system design allows scalar processing as well. How- ever, with respect to flexibility, the architecture is non-flexible and applications must be 22 r' """" ‘1 Pipeline Processor 30 Memory Banks 128 Bits x 8 Banks 0-31 Containing 4M Bytes Stack and A.B.C.D ' 128-Bit Buses Fig. 2.9 A block diagram of the Star-100 data paths. 23 structured around a vector for most eflicient performance. 
The long start-up times are seen as a disadvantage and the communication network between the processors and memory banks--wires--is seen as a disadvantage of V151 implementationz‘20 2.2.6 Cray-I. Cray-1 is a very high-speed function-oriented general purpose computer built by Cray Research Incorporated and is capable of processing 80 million instructions per second.20’30 Both scalar and vector processing capabilities are incorporated into its design. A block diagram of the Cray-1 is shown in Figure 2.10. The main memory can be up to one mil- lion 64-bit words of 50 nsec cycle-time bipolar memory. The memory is 16-way interleaved so the CPU can easily achieve a data transfer bandwidth of one word per clock cycle. The 1/0 sys- tem consists exclusively of channel connections to other computers and channel connections to high-speed permanently mounted disks. Cray-1 is designed to extend the independent functional unit concepts developed in early CDC 6000 and 7000 series equipment.20 While the system avoids some setup problems found in the STAR-100, its architecture suffers from the fixed vector length. Vector chaining is used to circumvent the fixed vector length problem.30 However, the machine will not run efliciently if full task switching is done very frequently and special designs for each functional unit are not conducive to a single chip V151 implementation.2’3’o 2.2.7 Systolic arrays. Systolic arrays are special-purpose, high-performance multiprocessor devices. A systolic system is described by a network of processors which rhythmically compute and pass data through the system. Every processor regularly pumps data in and out, each time step performing some short computation, so that a regular flow of data is kept up in the net- work.2 1 Many basic matrix computations can be pipelined on systolic arrays composed of many inter- connected inner product step processors. Frgure 2.11 shows a hex connected systolic array for matrix multiplication. An inner product step processor is a processor that performs the 24. Program Control Section I’ Program Counter ’4] ’ I I I n I (4 Parallel /’ Memory and 1/0 Section Paths) InterrUP .lnétgugtignjuiffiri - - - - - ’ I_ _ Functional Units' Register Section H {/0 in. [_yector Length I ‘ ‘ . I Memory . l I 0 l K _ l , I, ‘nteoer Add : A . p ' A Registers Inte.rupts M 1 [_ integer u t ' Address Buffer Re isters L Pop Count fil—H __.g.. T ‘ ir— . Address . ' Registers . Real Time Clock ‘r— ' Integer Add I ‘ T ‘ I L. .Shifr : l I Locrcal ' S I [ Pop Count V1.4— r Buffé' Multiply Vector Ma5k Recaprocal ApprOximation Unit [ Integer Add 1_. soft I Lgoical ‘ Sfialg ' 1. e9 sters .1 ._.E12%§12§,599..._ if ca ar eating I I I hit- I I I I I I —-----i-- W E Vector Registers Fig. 2.10 A functional block diagram of the Cray-1 architecture. 25 Fig. 2.11 The hex connected systolic array for matrix multiplication. 26 computation, RC - RC + R4 1: R. and makes the input values for RA and R, together with the new value of RC available as outputs. Kung gives detailed examples of matrix computa- tions on systolic arrays.2| The systolic computations are characterized by the strong emphasis upon data movement, pipelining in particular. The arrays have simple and regular communication paths and almost all processors are identical. This makes systolic arrays attractive for V151 implementation. How- ever, the regular communication paths are obtained by sacrificing flexibility in the network. 
The rigid data flow paths are not reprogrammable and their structure depends on the computa- tion problem. As a result, a unique array must be provided for each difl’erent computation problem and, if the array does not have universal appeal, then the use of such a V151 part in a new architecture brings about some high pressure constraints.22 2.2.8 Other commercial multiprocessors. There are other multiprocessor systems besides the ones we have reviewed: IBM 370/168, CDC Cyber I70, Honeywell Series 60 level 66, Univac 1100 Model 80 and Burroughs B7700, to name a few. Each of these multiprocessing systems has distinct advantages and unique constraints with implications for performance. Some of these systems have architectural organizations similar to the previously reviewed sys- tems. The Cyber 170, for example, embodies the principle of functional partitioning similar to the Cray-l system. A survey which highlights some of the architectural strategies of the above commercial multiprocessor systems is found in reference 31. 2.3 Observations Our brief review of several key multiprocessing architectures provides us with a basis for making some general comments about multiprocessing systems. One central issue in their design relates to the networks over which the multiple processors communicate to other processors or memory modules. Clearly, as the number of processors increases, the charac- teristics of this communication network become critical to overall system performance, cost, 27 and reliability. For example, Cmmp uses crossbar switching networks for communication pur- poses. An N-by-N crossbar switch has full-access and non-blocking interconnection capabilities. However, the difficulty with crossbar is that network costs grow with N2. Given V151 perfor- mance bounds, crossbar networks are infeasible for single-chip multiprocessor systems. Pease’s indirect binary n-cube microprocessor array uses the binary n-cube switching net- work and Purdue’s PM4 uses a delta switching network. Both of these networks have similar complexity and cost. The number Of switch nodes in these systems grows with Mag; N; and, if implemented in small scale integration ($81) or medium-scale-integration (MSI) technologies, their network costs are cheaper than crossbar. But, if these networks are implemented in V151, their network costs are comparable to crossbar.24 Binary n-cube and delta networks are block- ing, which means under certain conditions messages going to difl'erent output ports will require use Of the same path between two switches. Since, under the assumed protocol, only one mes- sage can hold a given path during message transmission, blocking will occur. This blocking reduces the bandwidth and application range Of the networks and introduces added delays in the system Operation. Other multiprocessor systems such as Illiac IV and systolic arrays have even more restricted communications networks. Illiac IV uses a four-nearest-neighbor interconnec- tion and systolic arrays have fixed data path interconnections. This results in performance degradation when the problem structure does not match the physical system structure and the utility of such systems is limited by their specificity. The question thus arises, what type of interconnection network can be placed on a V151 multiprocessor chip that will enhance cOmputation and prove the multiprocessor chip useful in a large number Of applications? 
In Chapter 3, Time Division Multiple Processing (TDMP), we suggest time division switching networks as an alternative solution to this problem.32 Time-division switching networks have full-access and non-blocking interconnection capabilities like crossbar networks, use very few data paths, and the network costs are dominated by the 28 cost-per-bit of semiconductor memory. However, no single network is generally considered 'best' since the cost effectiveness of a particular design varies with such factors as the computa- tional tasks for which it will be used, the desired speed of interprocessor data transfers, the actual hardware implementation of the network, the number of processors in the system, and the cost constraints on the construction.33 The processors used in a multiprocessor system may or may not be identical. The Cmmp, Illiac IV and systolic arrays are examples of systems with identical processors. The Cray-1 (in which the independent functional unit concepts are employed) is a system with difl'erent proces- sors. It is essential that the processors used in a V151 multiprocessing system be identical modules organized in a simple, regular fashion with a minimum number of connecting paths between modules because such geometry leads to high density and, more importantly, to modu- lar design. The TDMP system adapts to these V151 requirements by using identical processor-memory modules connected to time-division multiplexed buses. Another issue in the design of multiprocessor systems is their performance. lnvariably, the scaling in performance of a multiprocessing system is sublinear. That is, using an N-processor multiprocessing system always yields less than N times the performance of the corresponding single-processor based systems. This is true for both N-unit parallel and N-unit pipeline archi- tectural organization of multiprocessor systems. One the one hand, we have multiprocessor architectures, like Cmmp, that are very flexible and have a wide range of application. However, this flexibility is at the expense of additional software and hardware operating system overhead needed for communication and control. On the other hand, we have multiprocessor architectures, like systolic arrays, that have very high performance but limited range of application. The very high performance architectures have low operating system overhead, but this low overhead is at the expense of fixed interconnected data paths between processing elements to enhance the computation for a specific problem. SO, 29 in our investigation we seek a balance between these two extremes with a multiprocessor archi- tecture that has performance and flexibility bounded by a general purpose architecture like Cmmp, on one hand, and a non-flexible special purpose architecture, like systolic array, on the other. We also seek an architecture that allows a simple software interface to the user. Con- ventional architectures speed up most applications by executing algorithms that have been transformed to use the multiprocessor structure or by fixing the transformed algorithm into the interconnection wiring of the multiple processors. Both approaches have proved successful in many ways; however, in every case permitting large amounts of parallel activity, it has proven far more difficult to obtain parallelism in software than to provide it in hardware. 
In view of the nature of parallel hardware systems and the practical difficulties of keeping them running at full speed, we investigate an architecture that executes data flow programs. Data flow programs are seen as a natural reinterpretation of conventional programs with parallel execution in mind. This approach abandons the classical instruction driven computing. The underlying problem with most current attempts to use parallel hardware is that they are based on traditional con- cepts of programming.36 These concepts in turn are based wholly on the serial von Neumann computer design, with instructions executed one at a time. In particular, use of a program counter remains obligatory. In instruction pipelines, no attempt is made to alter the basic von Neumann model. In vector and array processors, one instruction may operate on many pieces of data, but only one instruction executes at a time. In multiprocessors. many program counters step thrOugh subprograms simultaneously presenting complex problems of communi- cations such as memory conflict. The use of a program counter is inappropriate when programs are intended for parallel execution. Efforts to develop a model of computation which can efl'ectively express parallelism have yielded a form of program representation known as data flow. Execution of a data flow program is data-driven; that is, each instruction is enabled for execution just when each required operand has been supplied by the execution of a predecessor instruction. Dennis and Misunas, Gurd and Watson have done work on the design of 30 computers based on the data flow concepts.“’39 The system we investigate is a new, simpler implementation based on some of their work. CHAPTER III TIME-DIVISION MULTIPLE PROCESSING In this chapter, we present the Time-Division Multiple Processing (TDMP) architecture and examine its hardware complexity. It is called 'time-division' because the transmission and switching of information among the multiple processors is done with time-division techniques; i.e., time-division multiplexing and time-division switching. These techniques are useful in meeting the interconnection wiring and pin constraints imposed by V151 design while enhanc- ing overall arithmetic computation. The key to TDMP is the integrated transmission and switching system that provides the communication channels among the processors. After describing TDMP, we very briefly discuss implementing it using V151 technology. The intent here is to show that TDMP is very compatible with V151 implementation. Then we consider TDMP as a data flow processor and investigate how it executes programs expressed in data flow notation. 3.1 TDMP Architecture The basic TDMP organization is illustrated in Figure 3.1. The components consist of an N x M array of processing elements (PE), time-division switching networks (TDS), and a control unit (CU). We will give a brief description of each component and its interrelationship with the others. 3.1.] PE array. The basic structure of the N x M PE array is shown in Figure 3.2 for N - 12, M - l. N represents the number of PEs that share a unique pair of two-way incom- ing and outgoing time-division multiplexed (TDM) buses. The number in each PE box gives the PE time slot position on these buses. M represents the number of unique pairs of incom- ing and outgoing TDM buses in the array. 
The incoming buses are shown as two-way com- munication lines to the left of individual PEs and the outgoing buses are shown as two-way communication lines to the right of individual PEs. The remaining lines are control lines that 31 32 ‘1 Br Incoming Outgoing Receive Port Receive Port CONTROL UNIT Outgoing Incoming Transmit Port Transmit Port Input Input Outgoing Incoming TDS TDS Output Output A0 B0 Incoming Outgoing Receive Port Receive Port N x M PROCESSING ELEMENT ARRAY Outgoing Incoming Transmit Port Transmit Port Fig. 3.1. Block diagram of the TDMP architecture. 33 .uwcs Houucoo cam xmuum mm aqu mo Emuwmam xoon N.m .wwm man also? 9.2.85 4 25 3623.3 wad-50cm % mm III. NH m age: Houucoo e c mac uulocmuu wcuowuao 34 convert the PEs into time-division multiplexed circuits for transmitting and receiving informa- tion packets consisting of flag bits, switching addresses, destination addresses, operand tags, operands and PE signaling information. All PEs operate independently, are identical (see Fig- ure 3.3) and consist of a central processing unit (CPU), memory and a communications inter- face logic unit (CILU). The CPU has arithmetic and logic facilities to execute instructions stored in memory. However, these instructions are not enabled for execution until all data operands have been received from predecessor PE operations. The communications interface logic unit (CILU) interfaces the CPU to the incoming and outgoing transmit and receive TDM buses, Figure 3.3. The incoming transmit and receive TDM buses are used for communication purposes in receiving packets from other PEs. The outgoing transmit and receive TDM buses are used for communication purposes in transmitting packets to other PEs. Both communication arrangements are illustrated in Figure 3.4. Incom- ing packets contain data needed for the next computation and outgoing packets contain results of a previous computation. Each CPU independently coordinates its use of these buses through its CILU. The CILU consists of an addressable information packet buffer connected to each TDM bus and control logic with control inputs from the CPU and timing signal inputs from the control unit. The CPU can read from bufl'ers connected to receive TDM buses and can write into buffers connected to transmit TDM buses. The timing signals from the control unit are used to convert these bufl'ers into time-division multiplexed circuits. As a time-division multi- plexed circuit, each bufl'er is assigned a time slot in a frame that is regularly repeated. The con- tents of the transmit buffers are multiplexed onto its respective transmit TDM bus during its time slot and the contents on the receive TDM bus are demultiplexed into its respective receive buffer during its time slot. The time slots given to all buffers for a particular PE are the same. Since we have separate outgoing and incoming communication paths, PEs can simultaneously transmit and receive information packets during the same time slot. 35 UT on IT IR from P CU Q Flags I M cpu CILU ‘—— Control *r--'---4I~ r-—~ H I Data Bus 0T - Outgoing transmit bus OR - Outgoing receive bus IT - Incoming transmit bus IR - Incoming receive bus CPU - Central processing unit M - HeHIOry CILU - Conmunications interface logic unit Fig. 3.3 Block diagram of a TDMP processing element. 36 Nu.— .msumq mcowumowaaaaoo mm wafiowuao can wcaaooca mo OHemem q.m ouswwm mace; _ x mason .3; mecca mecca . a: ecu: e: no a: see: a co cam . _ 388.. «38 . .8 . . _ 38.5. .33 — e. 38 37 3.1.2 Tine-division switching network. 
The time-division switching (TDS) networks are used to permute the information packets among the PEs, Figure 3.1. There are two switching networks in TDMP, an outgoing TDS and an incoming TDS. The outgoing TDS has inputs from outgoing transmit TDM buses and has outputs connected to incoming receive TDM buses. The incoming TDS has inputs from the incoming transmit TDM buses and has outputs connected to outgoing receive TDM buses. Both TDS networks are identical and operate in exactly the same manner. To switch packets between PBS, the packets from individual time slots on one bus are placed in the same or difl‘erent time slots on other buses. The interchang- ing of time slots is essential to time-division switching.32 The operation of time slot interchang- ing is illustrated in Figure 3.5 for a single time-division multiplexed bus with four time slots. In this example, the input time slots 0 and 2 are interchanged; i.e., the contents on the input TDM bus during time slot 0 are placed on the output TDM bus during time slot 2 and vice versa. To switch information packets between buses, space-division switches are used to inter- connect (permute) TDM buses for each time slot period. This is known as “time multiplex switching."32 Figure 3.6 shows a TDS result in which both space and time switching occurs. Time slot 0 of TDM bus 0 is interchanged with time slot 2 of TDM bus 1 and vice versa. Figure 3.7 shows a block diagram of a single input/output TDS network. The architecture of the switch is extremely simple, but surprisingly powerful and flexible. The switch consists of addressable input and output registers, dual central processing units (CPUA and CPU,). Each of these CPUs has its own program memory (PM, and PMs), read/write modules, and multiplexors/demultiplexors. The processors (called network processors) operate concurrently and perform all the switching functions according to a stored program in each PM. All data in read/write memory (RAM) and I/O registers except the input address register are words in a common data memory (DM) and the only operation of each processor is to move a word from 34 some location in DM to another location in DM. A separate DM is associated with each pro- cessor. A move operation takes exactly two memory cycles, the FROM address is read from 38 rsa rsz r31 rsol Time-§lot lrss rsz rsr rso ( 1 I 1 fi 7,— I_(Time Switch) T51 - Time Slot 1 NOTE: PEO +1Paz (1-1,2,3,4) PEZ +~PEO Fig. 3.5 Example of time slot interchanging in the TDS. 39 153 152 rsr rso rss 752 151 750 J PE3 , 9:2 1 PEl 1 PEG T,fl. D,v,s,°n 9:3 1 PE? 1 PE1 1 PE6 l ‘T "" '1 ' Sutton 1 I I Bus 0 (Time ‘ space Bus 0 rss 752 rsr rso S'I‘C“) rss 152 151 150 _1 P27J P56 1 was J 954 9:7 I PEOJPES 1 PE4 ‘ n I m 1 —I I Bus 1 Bus 1 Note: PEO - 9E6 pas - pro Fig. 3.6 Example of both space and time switching in the TDS. 4O .xuosuoc weazoufizw ceamfi>fivtoawu unnu=o\u=ncfi mawcwm m mo Emhmmfiv xOOHm n.m .mwm scum—us: a E SE .- L OIR UL ‘ < n:¢ eacace< . _ < ammo: zen n—unu cut: scene: at: <2D A goo—a ale...— x.l.e: one»... 5.: _ .. ebeam-.ep sub-c =3: Oa_em accuse a. I I .4 . . can can: uaeunc u:___e:u.m mccaaac nth um—h use “any: ui..=e u:_k 0:0 mEm—cflx tits usee_ « ear I I. la - nu _ . . xenon death < e.e.m-_eh . E 0-! use: . . ¢e___-euun . . vantage . nah n<3m 11.11 nmcuv—i car can: canes a... eu uses: ¢¢~hcw mxuhmuuns hnnlu Itasca: .eo—uneuuaon . . uancu . 
so...» 41 PM and is used to read at that address from DM, then a T0 address is read from the next loca- tion in PM or from the input destination address register, A, which is the address where the word is written in DM. In the TDS network, this move operation corresponds to putting the input information packet data register contents into the CPU hold register and performing time slot interchanging on this data by storing the data packet in a memory location in DM that represents the same or diflerent time slot position. When the input destination address register is used in the TO cycle of the move operation, the switching is called dynamic and the proces- sor move instruction executed is called 'indirect move.“ When the PM address data is used in the TO cycle of the move operation, the switching is called static and the processor move instruction executed is called 'direct move.“ The operations and capabilities of the individual network processors of this type architecture are developed by Lipovski and called the C-move processor.34 However, the network processor differs slightly from the C-move processor in that it allows an addressing mode for which the address in TO cycle of the move operation is taken directly from a special address register and not PM. This change enhances the switching speed of the TDS network. Each input packet is serviced sequentially according to its time slot posi- tion. The processor completes its service cycle for each packet during one time slot. After one frame, all data packets have been stored into DM. During the next frame, all of the stored data packets are moved out sequentially from RAM locations in DM to the output register loca- tion in DM. Since each processor must first store all packets during one frame then read out all packets in the next frame, two CPUs are needed so that when one CPU is inputting packets the other CPU is outputting packets. CPUA uses DM,. for its input move operations; however. the input registers are only associated with DM,. when CPU,. is inputting packets, otherwise these registers are associated with DM, which is used by CPU, in its input move operations. Simi- larly, the output registers are only associated with CPU,. and DM,. when CPU A is outputting packets, otherwise they are associated with CPU, and DM, when CPU, is outputting packets. A multiplexor and a demultiplexor are used to connect the input and output registers to a DM 42 by using framing information to make the correct association. We have two frames called A and B. During frame A the input registers appear as memory locations for CPU, in DM, and the output registers appear as memory locations for CPU, in DM,. During frame B the output registers appear as memory locations for CPU, in DM, and the input registers appear as memory locations for CPU, in DM,. The execution of the processor program in both CPUs is in synchronization with the input and output TDM information packet buses. The input is synchronized by using the input flag bit as a predicate for program execution, using the input switching address as the program counter for PM, and using time slot sizes greater than or equal to the time it takes the network processor to execute the instructions pointed to by this switching address. The output is syn- chronized by using the time slot counter value as the program counter where the instructions pointed to in PM sequentially output stored packets and flag bits. 3.1.3 Control unit. 
The control unit (CU) interfaces with 1/0 or a host processor, initializes the time-division switching networks, initializes the PEs, generates and sends timing pulses to PEs. The initialization steps are done prior to the beginning of program execution. Initializa- tion of the time-division switching network consists of storing in program memory the program switching instructions associated with control and time slot interchanging and resetting the net- work processor to begin execution. The initialization of PEs consists of resetting the CPU and storing instructions, data constants, and initial values into each PE memory. The control unit has direct access to the network processor’s program memories for program loading and pro- gram modification purposes. However, CU access to the PE memories is through the switching network. The CU has outgoing transmit and receive TDM buses and incoming transmit and receive TDM buses. These buses are connected to the TDS networks in the same fashion as the PEs, Figure 3.1. However, the CU has a multiple number of time slots per frame for com- munication purposes as compared to the PEs which have one time slot per frame. The CU can 43 use these time slots to send and receive information packets to and from individual PEs and to send control packets to the TDS network processor. The control packets modify the network processor program counter, which causes a jump to a subroutine that services that input control message. 3.2 TDMP Implementation There are several important attributes of TDMP that make it well-suited for VLSI implementation. The first attribute is that the majority of the processors are identical. This produces a major benefit in decreasing the layout time and effort of the architecture. The decrease comes about because the use of identical structures reduces the total number of dev- ices which must be individually drawn. In addition, the more structured layouts are easier to validate. A second attribute is that the processors and memory are in close proximity and can be implemented in the same technology. The locality of processing reduces the hardware and software communication overhead. A third attribute is that the underlying communication geometry, time-division multiplexed buses, is simple and regular which leads to high density and, more importantly, to modular design. In this way large TDMP systems can be developed as a collection of many simpler TDMP systems. A fourth attribute is that the switching net- work is regular and has full-access and non-blocking capabilities with simple and efficient con- trol. The regularity in the switch comes about through the use of semiconductor memory for switching the data. This use is compatible with V151 and future technologies, since these tech- nologies will decrease both cost per bit and memory access time. The flexibility in the architec- ture brings about a universal appeal. As a result, V151 cost advantages can be obtained by the wide variety of applications in which it can be used efl'ectively. However, this is not to say that TDMP is suitable for all applications. The last attribute is that overall architecture of TDMP is highly compatible with algorithms that use a packet form of data movement such as data flow.35 In data flow algorithms, there is no need to maintain long and continuous connections 44 between source and destination such as in block transfers of information. 
So in TDMP, processing elements send information packets to other processing elements only during designated time slot periods.

3.3 TDMP: A Data Flow Processor

TDMP executes programs expressed in data flow notation. These programs are normally described as program graphs which represent the data dependencies between operations. The attractiveness of such a system lies in the fact that it is data-driven; that is, each instruction is enabled for execution just when the required operand(s) has been supplied by the execution of a predecessor instruction(s). Since data flow instructions have no side effects, unrelated instructions can be executed concurrently without interference if each has its required operands. In this sense, the progress of a computation is determined by the passage of data through the system. Principal advantages of the TDMP data flow over conventional designs are reduced complexity of the processor interconnection network, greater use of pipelining, and a simpler representation and implementation of concurrent activity.

To illustrate the basic concepts of TDMP data flow operation, consider the data flow program shown in Figure 3.8. This program represents the computation required for a second order recursive digital filter

Y(n) = AX(n) + BX(n-1) + CY(n-1) + DY(n-2)

where X(n) and Y(n) denote input and output samples for time nT, where T = 1. In this diagram, PE operators 2, 3, 4 and 5 are single-input operators that multiply by the fixed parameters A, B, C and D; PE operators 6, 7 and 8 are two-input operators that perform addition; and PE operator 9 is an identity operator that transmits its input values unchanged. Each small solid dot is a link that receives results from an operator and distributes them to other operators for use as operands. Input operator 1 represents the outgoing transmit control unit port through which an external stream of values that represent the input signal X(n) is presented to the program.

Fig. 3.8 Data flow program graph for a 2nd order difference equation.

Similarly, output operator 10 represents the incoming receive control unit port at which the sequence of values representing Y(n) is delivered during program execution. The large solid dots (tokens) show the presence of values at certain input arcs of operators and define the initial configuration for program execution. An operator with tokens on each of its input arcs and no tokens on its output arcs is enabled, and may fire by removing the tokens from its input arcs, computing a result using the values associated with the tokens, and associating the result with a token placed on the output arc of the operator. A link is enabled when a token is present on its input arc and no token is present on any of its output arcs. It fires by placing tokens on each of its output arcs and removing the token from its input arc. The new tokens distribute copies of the value associated with the input token over each output arc of the link.

TDMP can perform conditional execution by using PEs to perform the primitive branch operation. The branch operator selects one of two output arcs (destination addresses) on which to place (send) its first input data, according to the state of a second Boolean input value. The two possible firing states lead to the execution sequences shown in Figure 3.9. The operator can be used to achieve conditional expression evaluation at a higher level.
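Before turning to conditional evaluation, the operator and link firing rules just described can be made concrete with a short executable sketch. The Python below is ours and purely illustrative; the graph encoding and function names are not the TDMP instruction format. It fires any node whose input arcs all hold tokens and whose output arcs are empty, which is exactly the enabling condition stated above.

    # Minimal token-driven firing sketch (illustrative only; the node/graph
    # representation is our own, not the TDMP packet or instruction format).

    def run_dataflow(nodes, tokens, max_steps=100):
        """nodes:  {name: (function, [input arcs], [output arcs])}
           tokens: {arc: value} -- arcs currently holding a token
        """
        for _ in range(max_steps):
            fired = False
            for name, (fn, ins, outs) in nodes.items():
                enabled = all(a in tokens for a in ins) and not any(a in tokens for a in outs)
                if enabled:
                    result = fn(*[tokens.pop(a) for a in ins])   # consume input tokens
                    for a in outs:                               # copy result onto each output arc
                        tokens[a] = result
                    fired = True
            if not fired:
                break
        return tokens

    # Example: y = a*x + b, written as two operators sharing the arc "ax".
    nodes = {
        "mul": (lambda a, x: a * x, ["a", "x"], ["ax"]),
        "add": (lambda p, b: p + b, ["ax", "b"], ["y"]),
    }
    print(run_dataflow(nodes, {"a": 2.0, "x": 3.0, "b": 1.0}))   # {'y': 7.0}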
Figure 3.10 shows a natural translation of the high level conditional assignment:

ABS := IF A ≥ 0 THEN A ELSE -A.

Conditional flow graphs should be constructed with caution, since the absence of tokens flowing down some arcs might leave other tokens stranded at inputs to nodes. Conditional expressions, such as the one in Figure 3.10, are 'safe' provided that both THEN and ELSE expressions are stated and are of the appropriate type. Conditional evaluation combined with cyclic or reentrant flow graphs makes TDMP extremely powerful. For example, an iterative or loop construct can be implemented by conditionally deciding whether to send tokens to the next block in a program, or to recycle them through the current block.

Fig. 3.9 Example of a data flow branch operation.
Fig. 3.10 Example of higher level conditional expression evaluation.

However, problems can arise in using some reentrant flow graphs because of blocking and token identification. Gurd and Watson discuss these problems and cite several examples to illustrate the issue.36 Dennis first proposed a very basic version of a data flow language in which instruction execution was limited only by the data dependencies of the program. Dennis and Misunas did preliminary work on the design of a computer based on this language.35 Rumbaugh has expanded and improved the earlier version of the data flow language proposed by Dennis and has developed a multiprocessor architecture consisting of N identical activation processors.38 Each processor is capable of executing in a pipeline manner several data flow instructions at a time.

The TDMP data flow processor is conceived as using each processing element as a combined operator and link, or just a link. The time-division switching networks in TDMP are then used as a means by which the link operations can be carried out. If a particular data item is to be used concurrently in more than one place or time in the system, then that data item must be explicitly sent to the multiple processing elements where it will be used. By using submultiplexing in both the PEs and switch, A frame and B frame, two items can be sent to different destinations. Submultiplexing is an important part of TDMP when transmitting to multiple locations in order to prevent two PEs from simultaneously transmitting to the same PE during the same frame.

Data items are transmitted in the form of an information packet. A PE information packet contains six fields: flag, switching address, destination address, tag, operand and signalling. The switching address and destination address are used by the switching network to direct the tag, operand, and signalling fields of each packet to the correct destination. The tag field is used by the PE to identify what operator in PE memory the data operand is associated with. By storing multiple operators in memory, task or program switching is performed more quickly. The number of multiple operators is limited by the PE memory size. The signalling field is used by the PE to transmit and receive request and acknowledge signals when exchanging information packets between PEs. Once placed on a data path, packets remain there until destroyed as a consequence of being acknowledged by the destination PE. The packet is placed into a firing set of the PE using the tag field. Any PE can execute any machine language program provided that it has a sufficient amount of storage.
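The firing-set mechanism just mentioned, and elaborated in the next paragraph, can be sketched in a few lines. The Python below is ours and purely illustrative: the tag of an incoming packet selects a stored operator, the operand is stored if the operator is not yet firable, and the PE fires immediately once the operand set is complete. The class and function names are assumptions for the example.

    # Sketch of a PE's packet handling and firing set (illustration only).
    from dataclasses import dataclass, field

    @dataclass
    class Operator:
        needed: int                       # number of operands required to fire
        fn: callable                      # operation performed when the PE fires
        received: list = field(default_factory=list)

    def receive_packet(operators, tag, operand):
        """Store the operand under its tag; fire immediately if the set is complete."""
        op = operators[tag]
        op.received.append(operand)
        if len(op.received) == op.needed:             # PE becomes firable
            result, op.received = op.fn(*op.received), []
            return result                             # would be formatted into outgoing packets
        return None                                   # not yet firable: operand is stored

    # Example: the operator stored under tag 0 is a two-input add.
    ops = {0: Operator(needed=2, fn=lambda a, b: a + b)}
    print(receive_packet(ops, tag=0, operand=4))      # None (waiting for second operand)
    print(receive_packet(ops, tag=0, operand=5))      # 9    (fires)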
The machine language program defines the operation performed by the PE; e.g., arithmetic or logical type operations. If the incoming packet data does not make the receiving PE firable, then the packet data is stored in the appropriate place in the PE memory. This situation occurs when the received packet does not completely satisfy the needed operand requirements of the PE. For example, if the PE needs two Operands to be firable and has previously received none, then the incoming packet will be stored until the second operand has been received. If the incoming packet data makes the PE firable then the PE fires immediately, using the packet data as needed. When the PE produces a result, it for- mats an outgoing information packet for each copy of the result that is needed. CHAPTER 1V TIMING CONSIDERATIONS In Chapter 3 the PEs and control unit were described as time-division multiplexed circuits when transmitting and receiving information packets. The timing relationships of the control signals necessary to convert the PEs and control unit into time-division multiplexed cir- cuits were not discussed in Chapter 3. In Section 4.1 of this Chapter, we describe these timing relationships in detail with a timing diagram. In Section 4.2, we develop an expression that bounds the maximum multiplexing and demultiplexing rates achievable in TDMP. The max- imum multiplexing and demultiplexing rates fix the minimum time slot size on the time-division multiplexed buses. The minimum time slot size in turn determines the logic one and logic zero time duration requirements of the timing control signals discussed in Section 4.1. The TDMP switching network has an inherent delay to aflect switching of the information packets. In Section 4.3, we present an expression that bounds the maximum switching delay through the time-division switching (TDS) network in TDMP. 4.] Timing Diagrams The timing diagram of the control signals that convert the control unit and each PE into a time-division multiplexed circuit is shown in Figure 4.1. These control signals consist of P, Q, TWD, RWD, A and B frame. In this diagram 16 time slots are created (0-15) by using four Q signals (Q0, Q4, Q, and Q”) and four P signals (Po, P,, P, and P3). Each PE has one Q signal and one P signal connection. The control unit has one P signal and four Q signal con- nections. The simultaneous occurrence of a logic one on both the Q and P signals with logic one on TWD for transmitting and logic one on RWD for receiving select the control unit or a PE for multiplexing and demultiplexing information packets. TWD and RWD are clock signals used to multiplex and demultiplex the information packets to and from the time-division 51 52 .Eouwmm mzna uon mafia 0H m pom mamcwwm Houucoo can mafiawa ago .mHh 1FTCC EC1EFTCC1—LI JL1FJIJITCCFTEEEF71C|F 11411:. .11.. E FI1._ TIL 71L r1L E _.11_1 1.. E FIIL F1 _ w h — 3.3.:Ea:1.1.1eEa11%;.ages; It. "1 53 multiplexed buses. The 4x4 matrix shown in Figure 4.2 is an alternative representation of the P and Q signals used for creating each time slot position. The numbers in each box represent the time slot position created. In addition the time slot positionfs) for the control unit and each PE can easily be determined by simply adding together the subscripts on the P and Q sig- nals for that PE or control unit. For example, P3 and Q4 generate time slot 7, 3 + 4 -= 7. In TDMP time slots 0, 4, 8 and 12 are given to the control unit and the remaining time slots are used by the PEs. 
The length of time for which both Q and P are simultaneously logic one sets the time slot size. This time slot size is a function of the maximum multiplexing and demultiplexing rates and will be discussed in Section 4.2. The A and B frame control signals are used for submultiplexing purposes. Separate registers are used to hold A and B frame information packets. When A frame is logic one only A frame information packets are multiplexed and demultiplexed, and when B frame is logic one only B frame information packets are multiplexed and demultiplexed. However, A and B frame cannot simultaneously be logic one. More levels of submultiplexing can be employed in TDMP; however, increasing the amount of submultiplexing also increases the transmission delay of individual packets within the system.

4.2 Maximum Multiplexing and Demultiplexing Rates

To determine the maximum TDMP multiplexing and demultiplexing rates we must start at the time-division switching (TDS) network. The TDS network works on a sampling basis. The transmitted information packets on the input of the TDS are sampled then stored in random access memory (RAM). The time between information packet samples is called the "sampling period" of the switch, T_s. The minimum sampling period is bounded below by the read/write cycle time, t_rw, of the RAM plus the control overhead, t_c, for processing the information packet:

T_s ≥ t_rw + t_c

The control overhead consists of the time for scanning the input for new samples, testing the frame condition, determining the type of service requested and reading the destination address. A flow chart for the TDS control algorithm is shown in Figure 4.3.

        Q0   Q4   Q8   Q12
  P0     0    4    8    12
  P1     1    5    9    13
  P2     2    6   10    14
  P3     3    7   11    15

Fig. 4.2 Matrix representation of the P and Q control signals. Note: The numbers in each box represent the time slot position created from the P and Q control signals.

The minimum read/write cycle time, t_rw, is fixed by the current state of the art of semiconductor memories, and as the technology improves t_rw will decrease. The control overhead, t_c, will also decrease as technology improves; however, further improvements can be obtained by maintaining precise synchronization of the TDS with the input multiplexed buses, thereby eliminating the scanning of flag and frame bits used for synchronization. In addition, if we provide for only one type of switching service, then we can eliminate the control time needed to determine the type of switching service requested. In this investigation we will take the conservative path of keeping the extra control overhead, since our primary purpose is to determine the feasibility of using time-division techniques to enhance computation, and further enhancements can be made by the elimination of the extra control overhead. As a result, the switch sampling period puts a lower bound on the minimum PE time slot size,

T_x ≥ T_s

where T_x is the time slot size and T_s is the switch sampling period. The minimum time slot size fixes the maximum multiplexing and demultiplexing rates in the system,

f_max ≤ 1/T_s(min) = 1/(t_rw + t_c)

In our control algorithm for the time-division switching network used in TDMP, t_c = 8·t_rw (Figure 4.3). Therefore in TDMP the maximum multiplexing rate is

f_max ≤ 1/(9·t_rw)
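As a quick numeric check of the bound just derived, the following sketch (modern Python, added by us purely for illustration; variable names are ours) evaluates f_max for the 30 ns RAM read/write cycle time assumed in Section 4.3, and confirms the 270 ns time slot that value implies.

    # Numeric check of the multiplexing-rate bound f_max <= 1/(t_rw + t_c).
    # The 30 ns cycle time is the value assumed later in the text.

    t_rw = 30e-9             # RAM read/write cycle time (assumed 30 ns)
    t_c = 8 * t_rw           # control overhead of the TDMP switching algorithm
    t_slot = t_rw + t_c      # minimum time slot size T_x
    f_max = 1.0 / t_slot

    print(f"T_x   = {t_slot * 1e9:.0f} ns")      # 270 ns
    print(f"f_max = {f_max / 1e6:.2f} MHz")      # about 3.70 MHz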
4.3 TDS Maximum Switching Delay

The maximum switching delay of the TDS in TDMP is a function of the time slot size, the number of time slots per frame, and the process of time slot interchanging. In TDMP 16 time slots make up one frame. Each PE is assigned one time slot and the control unit is assigned four time slots. These frames are regularly repeated and alternated via submultiplexing between A and B frames.

Fig. 4.3 Flowchart for the TDS control algorithm.

As described earlier, in Chapter 3, the TDS network switches information packets by providing for the association of input time slots with output time slots. All input information packets are processed sequentially and stored into RAM, then read out sequentially from RAM. This process of reordering the sequential input information packets by storing them into RAM locations specified by destination addresses and then reading out the RAM contents sequentially is called time slot interchanging. This technique implies a time delay to effect a change in time slot position. The delay is the amount of time it takes the TDS to store all the input information packets for one frame into RAM locations representing time slot positions plus the time it takes the TDS to sequentially output all these stored information packets. As a result, the maximum switching delay can be predicted by the following expression,

T_D = 2·N·T_x

where N is the number of time slots per frame, T_x is the time slot size, and the 2 is a result of the time slot interchanging process. In TDMP N = 16, and if we assume that the read/write cycle time of RAM is 30 ns then T_x = 270 ns and the maximum switching delay in the TDS equals

T_D = (2)(16)(270 ns) = 8.64 μs

The TDS switching delay results in a maximum packet switching delay of 17.28 μs. The packet switching delay is twice the TDS switching delay because of the A and B frame submultiplexing, in which packets are transmitted every other frame. However, if the same information packets are sent during both A and B frames, then the maximum packet switching delay equals 8.64 μs. If the switching control overhead is reduced to zero, the time slot size decreases to T_x = 30 ns, and the maximum switching delay becomes T_D = 0.96 μs.

The maximum switching delay in the TDS network increases linearly with N,

T_D = A_1·N

where N is the number of time slots on the TDS input and output time-division multiplexed buses and A_1 is a weighting coefficient of the TDS network. The coefficient A_1 is determined from the switching implementation, control algorithm, and any other factors that might influence the switching speed of the TDS network. The other factors might include fault detection built into the hardware and diagnostic programs in software to improve reliability of the network. Many of the popular multistage interconnection networks such as the binary n-cube, banyan and shuffle exchange have a switching delay which increases logarithmically with N,

T_D = A_2·log_2 N

where N in these networks represents the number of input and output ports of the network and A_2 is its weighting coefficient.40 The coefficient A_2 is determined from the same factors that are used to determine A_1 in the TDS network. In comparing the switching delay of interconnection networks the rate of increase, N or log_2 N, is often used as a measure of performance. This is a useful measure when N is very large. However, for practical systems built with technology we can use in the immediate future, we must not only consider the rate of increase but also the weighting coefficients that multiply the rate of increase.
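The delay expression and the coefficient comparison can be illustrated numerically. The sketch below (Python, ours) evaluates T_D = 2·N·T_x for the numbers quoted above; the A_2 value in the second part is hypothetical, invented only to show a crossover of the kind argued for next, and should not be read as a measured figure for any real multistage network.

    # Illustrative evaluation of T_D = 2*N*T_x and of the A1*N versus
    # A2*log2(N) comparison.  N and t_rw follow the text; A2 is hypothetical.
    import math

    def tds_delay(n_slots: int, t_rw: float, overhead_cycles: int = 8) -> float:
        """T_D = 2 * N * T_x, with T_x = (1 + overhead_cycles) * t_rw, in seconds."""
        return 2 * n_slots * (1 + overhead_cycles) * t_rw

    t_rw = 30e-9
    print(tds_delay(16, t_rw) * 1e6)                      # 8.64 us, with control overhead
    print(tds_delay(16, t_rw, overhead_cycles=0) * 1e6)   # 0.96 us, overhead removed

    A1 = 2 * 9 * t_rw     # per-slot coefficient implied by the expression above (0.54 us)
    A2 = 10e-6            # hypothetical per-stage coefficient for a log2(N) network
    for n in (16, 64, 256):
        print(n, A1 * n < A2 * math.log2(n))              # True while the TDS is faster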
If the weighting coefficients are not used in determining the maximum switching delay, then the conclusions drawn from such an analysis might be wrong. As an example, if we compare the maximum switching delay of the TDS and the popular interconnection networks based only on the rate of increase, then we would conclude that the popular networks have a maximum switching delay less than that of the TDS network for all N ≥ 1, since log_2 N < N for all N ≥ 1. However, in Figure 4.4 we show that the maximum switching delay of the TDS is less than that of the popular networks for some values of N ≥ 1 if A_1 < A_2.

Fig. 4.4 An example to illustrate the A_1·N versus A_2·log_2 N switching delay issue.

This example points out that the issue in comparing maximum switching delay is not just the rate of increase, N versus log_2 N, but is A_1·N versus A_2·log_2 N. It has not been shown but is conceivable that A_1 << A_2, since the popular type interconnection networks, referred to as log_2 N networks, may have to make multiple passes through the network to reach the final destination and messages may have to wait as a result of blocking in the network. These factors influence the A_2 coefficient but not A_1. However, the applicability or superiority of one interconnection network over another should not be based only on switching delay. For example, unless efficient algorithms which use these networks and simple operating systems that manage their resources and supervise their processes are found, the discussion of switching delay is moot because the system as a whole will not be cost-effective and will experience considerable amounts of down-time due to software and hardware failures, thus reducing the system throughput.

We feel that the TDS network offers some significant advantages over the log_2 N interconnection networks in algorithm mapping and operating system implementation. The mapping advantages result from the non-blocking and full-access interconnection capabilities of the network. Since all permutations are possible in the TDS, the programmer can construct or restructure algorithms without concern about the interconnection capabilities of the network. And because the network is a C-move processor, many of the operating system functions can be directly implemented in the network, eliminating much of the time and hardware needed in supervising the switching process and in communicating and acting on exceptional conditions arising during switching.

The TDS switching delay is tolerable in the TDMP system if we consider that while some PEs are transmitting and receiving information packets through the TDS network, other PEs are performing computations. As a result, throughput, i.e., the quantity of useful information processed by the system per unit time, is a better measure of performance for TDMP than is the switching delay of the TDS network in TDMP. And since TDMP is based on data flow principles and the TDS network has full-access and non-blocking interconnection capabilities, the decomposition of programs, that is, programming sufficient parallel activity into software to keep the parallel hardware fully occupied, is an easier task to perform.

It was shown in this chapter that each PE needs only two signals, P and Q, to convert it into a time-division multiplexed circuit, and that the minimum time slot size generated by these signals is determined by the maximum TDMP multiplexing and demultiplexing rates.
The multi- plexing and demultiplexing rates were found to be dependent upon the sampling period of the TDS network. We also showed that the TDS network has an inherent switching delay that increases linearly with the number of time slots, N. We presented arguments as to why the rate of increase of switching delay in TDS and the popular logzN interconnection networks was not sufficient to determine the superority of the logzN networks over the TDS network. In the next chapter, we describe a simulation model of the TDMP architecture. CHAPTER V TDMP SIMULATION MODEL A simulation approach was chosen to analyze TDMP because it provides a timely and economical means of evaluating preliminary designs. Moreover, the versatility of this approach means that a wide range of alternative designs can be realistically analyzed. The use of a simulation model in our investigation has a two-fold purpose. The first is to analyze TDMP’s system performance, e.g., the switching capabilities and switching delay, the communi- cation protocols between processing elements, and the ability to execute a single data flow pro- gram at a time. The second purpose is to provide a detailed language description of the TDMP hardware that complements the less detailed block diagrams describing this hardware. As a result, the simulation language in our investigation has the following characteristics: — capability to simulate concurrent activity; — capability to simulate timing: — flexibility to simulate hardware at both the gate-level and functional-level; — easily translates hardware representation into a computer program. Several simulation languages were available to us to choose from for our simulation model. These languages were SPICE, ECAP, GPSS, and APL. SPICE and ECAP are circuit level simulation languages and are extremely useful in modeling the dc, ac and transient eflects in circuits.41 However, these languages are too low-level for our current simulation purposes because they model hardware at the transistor-level. These languages would be more useful to us when we have verified the correct operation of the system at the logic gate-level and are ready to test the design performance at the transistor-level for integrated circuit implementa- tion. 62 63 GPSS is a simulation programming language used to build computer models for discrete- event simulations.42’43 GPSS is a process-oriented language and is particularly suited to the translation from a flow-chart representation of a system into a computer program. This is a disadvantage for our simulation purposes because the computer program representation that results is too high-level. We have a hardware representation of our system and would like the computer program model to complement the less detailed block diagrams describing the hardware as well as to simulate the processes in the system. GPSS would be more useful in simulating the protocols in TDMP and not the hardware design. APL, however, meets most of our desired simulation language needs.44‘45’46 APL has the ability to perform both functional and gate-level simulation and the model implemented in this language neatly complements the less detailed block diagrams describing the hardware. The ability to perform both functional and gate-level simulations gives us the diversity to synthesize some parts of the model at a high-level using functional modules and some parts at a low-level using gates. 
Functional modules in the TDMP model tend to be those parts of the architecture that are not germain to the TDMP operation, e.g., arithmetic logic units, registers, memory storage units, etc. The gate-level models tend to be those parts of the TDMP architecture that are unique to its operation, e.g, control units, interface logic units, multiplexors, demultiplex- OI’S, etc. All of these simulation languages have short coming when they must simulate concurrent processing. These languages are based on sequential computation with a single processor, as a result they cannot at all or with limited success model asynchronous events or concurrent operations. In addition all of these languages are very costly and require fast, high capacity computers. Given the advantages and disadvantages of each simulation language we chose APL as the language in which to build our simulation model. This chapter begins with a description of the APL model of the time-division switching net- 64 work, Section 5.1. Then in Section 5.2, the APL simulation model of the processing element is described. And, finally, in Section 5.3, the operation of the complete APL simulation model of TDMP is discussed. 5.1 TD? Simulation Model The block diagram of the data paths for the time-division switching (TDS) network simulation model is shown in Figure 5.1. There are two identical sets of data paths in the TDS network, one for switching A frame information packets and one for switching B frame infor- mation packets. As a result, only one set of these data paths are described in presenting the TDS network simulation model. The bused data paths in the model consist of DBUS, OBUS, IBUS, INBUS and OUTBUS. DBUS is the network processor data bus. All data moved to or from memory, registers and hardware operational units is placed on DBUS. OBUS and IBUS are internal use buses of the network processor. These buses connect the inputs and outputs of the network processor internal use registers to DBUS. INBUS and OUTBUS respectively con- nect the switch input and output port registers to DBUS. Many of the remaining data paths consist of gated input and output connections to these buses from registers, memories, and hardware operational units such as an adder, decrementer and incrementer. By gated connec- tion we mean that a component must be selected in order to put data on a bus or take data ofl‘ a bus. The gates are represented by the small circles on the data paths shown in Figure 5.1. Specifically, INBUS has gated connections to the input and output of the input flag (INF) register, the outputs of the switching address (SWAD) register and input data (DIN) register. The INF register signals a request for service to the switch during an input packet operation. While the SWAD and DIN registers respectively contain the address of the type of service requested and the input packet data to be switched. The OUTBUS has gated connections to the input and output of the output flag (OUTF) register, the input of the data out (DOUT) register and the output of the time slot counter (TSC) register. The OUTF register signals the switch ti:- '— .Haeoe .5323: 5.2 m8 2: do seamen. 383 «an .3... ...g 3 .8 .88 67 for service during an output packet operation. While DOUT register contains the switched out- put data packet and TSC provides addressing used by TDS for demultiplexing the appropriate output data packet to DOUT. It is important to note that INBUS and OUTBUS are bidirection- ally gate connected to the processor data bus (DBUS). 
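The notion of a "gated connection" used throughout the model description can be expressed directly: a register drives or samples a bus only while its gate signal is asserted. The short sketch below (Python, written by us for illustration; class and signal names are abbreviations of ours, not the APL model's identifiers) shows one way such gating could be modelled at the functional level.

    # Functional-level sketch of a gated bus connection: a register drives the
    # bus only while its output gate is asserted, and loads from the bus only
    # while its input gate is asserted.

    class Bus:
        def __init__(self):
            self.value = None

    class GatedRegister:
        def __init__(self, name: str):
            self.name = name
            self.value = 0

        def drive(self, bus: Bus, gate: bool) -> None:
            if gate:                    # selected: put register contents on the bus
                bus.value = self.value

        def load(self, bus: Bus, gate: bool) -> None:
            if gate:                    # selected: capture the bus contents
                self.value = bus.value

    # Example: gate the DIN input register onto DBUS and capture it in HOLD.
    dbus = Bus()
    din, hold = GatedRegister("DIN"), GatedRegister("HOLD")
    din.value = 42
    din.drive(dbus, gate=True)          # input-register gate asserted
    hold.load(dbus, gate=True)          # HOLD gated off DBUS
    print(hold.value)                   # 42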
The DBUS is bidirectionally gate connected to a non-addressable HOLD register, data memory (DM) module, and addressable internal use registers (IR). The HOLD register tem- porarily stores data during processor move operations. While DM is used primarily for writing in and reading out data packets via the HOLD register during the time slot interchanging pro- cess. However, some parts of DM are also used as a scratch pad in conjunction with condi- tional move operations. The IR registers are the program counter (PC), the jump to subroutine (JSR) and index (X). The PC register contains the address of the next move instruction in the processor program memory. The JSR register stores the return address in jump to subroutine operations and the X register is used for index addressing, and up-down counting. The remain- ing IR registers, S and C, are additional index registers. However, it was found that they were not needed in this simulation so they are not fully implemented for index addressing. Other gated DBUS connections consist of outputs from the decrementer and incrementer Operators and the inputs of the N flag bit register, decrementer and adder operators. The N flag bit is used by the TDS control unit to test for conditional moves. The N flag bit in conjunction with conditional moves is described in the TDS control unit simulation model in Section 5.1.1. Some of the remaining data paths consist of connections from the memory address register (MAR) to the memory address decoders of the input, output, data memory and internal-use registers. These decoders are not explicitly shown in Figure 5.1. However, these decoders enable gates if the MAR value is in the address space defined for that decoder. In this simula- tion the internal registers are given memory address locations 0-7, data memory has locations 8-63, the input registers have 64-71 and the output registers have 72-79, Figure 5.2. Gate 68 80 Output 72 Input JH__ Data Ila-cry 8 Internal Use 0 Figure 5.2 TDS simulation model memory map. 69 signals GIR, GDM, GIN, and GOUT respectively gate the internal registers, data memory, input registers, and output registers data onto and from the DBUS. The remaining decoder sig- nals select specific registers within these groups. The MAR has gated input data path connections from the destination address register (A), program memory bufl‘er register (AR) and the output of the adder (ADD). Gating A into the MAR is a result of an indirect move, gating AR into MAR is a result of a direct move and gat- ing ADD into MAR is a result of an indexed move. The few remaining data paths consist of gated connections from the HOLD register to the incrementer input and program counter register connections to the incrementer input and out- put for incrementing the PC. And finally, we have the data path gated connections from the program counter to the program memory address register (PMAR) and bidirectional program memory data bus (PMDATA) with gated connections from the program memory (PM) to the program memory bufl'er registers (AM and AR). The program memory contains the TDS switching program consisting of only move instructions. A move instruction consists of an addressing mode and address. The AM register holds the addressing mode value in the From and TO cycles of a move instruction. In the FROM cycle these addressing modes are immedi- ate, direct and indexed. In the TO cycle, these modes are direct, indexed, conditional and indirect. 
The AR register holds the address value in the FROM and TO cycles or in the case of immediate move the value of the data to be moved. As a result, the AR register has a gated connection to the DBUS for sending data to the HOLD register. Both AM and AR are inputs to the TDS control unit which performs the address evaluation and generates the timing and control signals necessary for all move operations. 5.1.1 TDS Control Unit Figure 5.3 shows a block diagram of the TDS control unit. The control unit consists of a combinational logic network and memory. The inputs to the control unit are: the least 70 .HMRCD n —# .___... Bartram. y ..___. so CODINA‘I'IGIAI. s :D were AR___. crooner 81 a nerve». :D- o 1 cu sun: aacrsrra Figure 5.3 Block diagram of the TDS control unit. 71 significant bit of the program memory address register (PMAR(5)), the addressing mode bits (AM), the N flag bit, the decoder outputs (8,, S ,, SS) and the control unit state register out- put, (2,, 2,, 22). The control unit outputs consist of control signals (Y), and the control sig- nals Do, 0., 0,. The Y control signals gate in both space and time data paths in the TDS net- work. The D control signals generate the next state of the control unit. The least significant bit of the program memory address register, PMAR(5), determines the cycle in the move Operation. If this bit is logic 1 the move cycle is identified as FROM and if this bit is logic 0 the move cycle is identified as T0. The AM bits are interpreted by the control unit so that the appropriate gate signals are generated for the type of addressing that is required. For example, in an indexed move the control unif must also generate the control signals that gate inputs to the adder from DBUS and the output from the adder to the MAR as well as the normal read/write memory control signals. In Figure 5.4, Tablel shows the addressing modes as a function of the AM bits and move cycle. The N bit is used as a test signal in conditional move operations and is the most significant bit of each word moved. This bit is gated into the N bit register off the DBUS during the TO cycle of every move operation. In a conditional move, if the N bit is logic 1 the moved data is written into a memory location and if the N bit is logic 0 the moved data is not written into memory. In both cases the FROM cycle of the move opera- tion is performed, however in the TO cycle the N bit condition is tested. The S0, S I and S 5 inputs to the control unit identify internal use registers that are involved with move operations such as increment, decrement and jump to subroutine. The network specification table shown in Figure 5.5 gives a detailed logic description of the properties of the control network. It shows for all the inputs to the control unit all the space and time outputs the control unit produces. For example, if the control unit is in state 0, Z, -Z 1-21-0, the contents of the program counter, PC, is gated into the program memory address register, PMAR. The clock advances the control unit to state 1, 2,-21-0, Z;-=l. During state 1 the mode and address contents of the program memory are placed in the AM From To 72 PMAR(S) AM AM AM AN AM 000 001 010 011 100 0 Immediate Direct Index Unused Unused x l Unused Direct Index Conditional Indirect x Fig. 5.4 Addressing modes for the TDS network Cdmove processor. 73 Fig. 5.5 Control unit logic network specification table. 74 and AR registers and based on the cycle and addressing mode the move operation is executed. 
If we assume that the addressing mode is direct (AM o-AM,-0, AM 2-1) for both cycles and we are in the FROM cycle, PMAR(5)-l. The next clock pulse advances the control unit to state 2 (20-0, 21-1, 22-0) where it gates the FROM address in the AR register into the MAR register. The next clock pulse advances the control unit to state 3 (lo-0,21-Zz=l) where it gates the contents from the address memory location specified by the MAR to the HOLD register and also gates the data paths necessary to increment the program counter. The next clock pulse advances the control unit back to state 0, where the next three clock pulses cause the next program memory word to be fetched and the MAR to be loaded. However, when the clock advances the control unit to state 3 the cycle will be TO, PMAR(5)-0, so the control unit gates the contents from the HOLD register to the memory address location specified by the MAR and increments the program counter. Figure 5.6 gives examples of all move operations possible with the simulated TDS network C-move processor. In these exam- ples assume initially that memory address location 15 has the value 20 and memory address location 17 has the value 30. 5 .2 PE Simulation Model The block diagram of the data paths for the PE simulation model is shown in Fig- ure 5.7. The data paths consist of DBUS, registers, communication interface logic unit (CILU), arithmetic logic unit (ALU), program memory, data memory and a C-move processor. DBUS is the PE data bus. All data moved to or from data memory, registers, CILU and ALU is placed on DBUS. The registers are partitioned into groups called an outgoing communication port and an incoming communication port. These ports are identical but are used for difi‘erent purposes. The outgoing port transmits request signals and data and receives acknowledge sig- nals. The incoming communication port receives request signals and data and transmits ack- nowledge signals. Since both ports have the same hardware for communication, only the 75 Inn-diate nave lSdll-7l after nve (151-7 Direct nove 151-317 after mite (lS)-30 INDEXED nove (sl-S 15 G-lZIx) after nove (15)-30 lO(x)d-l7 after nove (15)-30 S - indirect l - innndiate (x)- index t - conditional Fig. 5.6 Examples of TDS Conditional nave N-l 15¢ rlF- 17 after nove (15)-30 I'D 15¢ «II- 17 Ifter nove (151.20 INDIRECT aove (A1315 SA-nI-' 17 after nove (15)-30 Initially: HAR(15 ' 20 HAR(l7 ' 3O HAR(X) I 5 processor move operations. 76 .Hovoe cofiumasaam <9..:::. :89. 9232 3". ..I... .P .25.... qm< mm ago we amuwmwv xoon an.m .waa q - . . . . . . o . . . . . . . . . . r ‘------------------J 77 .Hovoe acqumaaawm na< as use we been eh.m .waa 78 outgoing port will be described. The registers that make up the transmitting section of the outgoing communication port are TINFA, TINFB, TSWADA, TSWADB, DINA, DINB, TAA, and TAB. All of these registers are gate connected to DBUS on their input side and gate connected by multiplexor gates to their respective time division multiplex bus on their output side, Figure 5.7. TINFA and TINFB are respectively the A and B frame transmit flag bit registers. These registers signal requests for service to the switch during the A and B frame PE time slots. The request for ser- vice is basically an enable signal that informs the switch if it should perform or not perform a switching operation on its current input data. Since each PE has its own unique time-slot to seize the switch there is no chance of collision between PEs requesting for switching service. 
This helps to improve the throughput of the system. TSWADA and TSWADB are the A and B frame switch address registers. These registers are initialized by the PE with an address value of a location in the switch program memory. This location in the program memory contains move instructions that perform the switching operations on the switch input data. In our inves- tigation the switch operations will be limited to moving the input data directly and indirectly to the switch data memory module. Direct and indirect move operations were described earlier in Chapter 3. DINA and DINB are respectively the A and B frame input data registers. The input data registers contain the data that is to be switched. Each of these registers is composed of three registers with separate gated inputs and a common gated output. The registers are labeled TRES, TAG, and TSIG with A and B subscripts added to each name to signify their association with either A or B frame. The TRES register contains the result of a previous PE computation. The TAG register contains a value used to identify which operation this result is to be used with in the next step of computation. The use of a TAG is required when the PEs are pro- grammed to perform multiple operations or operations in which data values are reentered into the data flow program. In this simulation each PE is programmed for a single operation and data flow programs that use feedback are not simulated. As a result, the TAG bit is shown but 79 is not actually used in this simulation. TSIG is the signalling bit register used to transmit the idle and request PE signalling states. Transmit registers TAA and TAB are respectively the A and B frame data packet destination address registers. These registers contain an address representing the same or diflerent time slot position of the PE. The switch uses this address to carry out the time slot interchanging operation on the input packet data. The receiving section of the outgoing port consist of OUTFA, OUTFB, DOUTA and DOUTB. OUTFA and OUTFB are respectively the A and B frame receive flag bit registers. These registers are sensed by the PE to determine if new receive data is available. For the out- going port the receive data consists of tag and PE signalling. For the incoming port the receive data consists of operands, tag and PE signalling. The receive data is contained in the DOUTA and DOUTB registers, respectively, for A and B frame data. The receive data registers have a single gated input, whereas the output has a separate gated connection to DBUS for each field in the register, Figure 5.7. The CILU data paths generate the outgoing and incoming PE transmit signalling bits as well as the PE execution enabled signal. The inputs to the CILU consist of the outgoing and incom- ing receive signalling hits, the ALU status bit and the current state of the PE. The receive sig- nalling bits were discussed earlier in this section, the ALU status bit, PST, is either in the idle or busy condition, the PE state information is stored in a register and new state values are gen- erated based on the ALU status, the receive signalling conditions and the previous PE state value. However, no state changes can occur until the CILU has received a clock pulse gen- erated by the PE control program. One clock pulse is generated each time the PE cycles through its control program. The various PE states and explanation of these states is given in Figure 5.8. The CILU is modeled at the logic gate-level. 
Figures 5.9 and 5.10 show the truth table representation of the CILU design. The CILU was decomposed into an outgoing and incoming 80 PE State Outgoing Incoming Explanation P080 P051 P152 P153 PE State 0 O D O idle O O 1 received operand A O O O 1 received operand B O O l l enabled 1 l O 0 results available 1 l l 0 results available and received operand A l l O 1 results available and received operand b l l l 1 results available and received operands A S I 0 l O O A frame results available 1 O O O 8 frame results available 0 l l O A frame results available and received operand A O l O l A frame results available and received operand l O l l l A frame results available and received operands A and I l O l O 8 frame results available and received operand A l O O l I frame results available and received operand I l O l l I frxme :e;ults available and received operands an Fig. 5.8 PE states for data flow computing. 81 mm OUTPUTS Outgoing Outgoing Outgoing Garrsnt Iecsive ALV Transmit State Signalling Status "Seat State Signalling P050 P051 POIA P0“ P51 P1520 P153 P050 P051 POTA POTS ENABLE __,.__1 o o X 8 S O O 0 0 O O O O X 3 O l l l O O l O O X S l l O O O O O l l O O O X l l l l O l l l O O I O l O l O O l ‘ l X 3 O O O O O l 1 X X l X l l o o o l l O l O ‘ l O l O O 1 o 1 X X X o o o o o l l l O X 0 O O O O o 1 x o X X o 1‘ o 1 o l O O x X x l O l O O Fig. 5.9 Logic truth table for the outgoing part of the CILU. 82 IlPUTS OUTPUTS Incoming Incoming Incoming mrrsnt lecaive ALU Trans-it State gignalling Status Part State Signalling I 2 P 3 RA PIRB PST POSO v P051 P152 P153 PITA PITB liq—La !-—-r—. O O O 0 I I 0 0 O O 0 0 1 0 I I 0 l 0 O O O 1 I I 0 1 O l 1 O I O I I l 0 O O l O I l I I 1 l O l 0 1 O I I I 0 1 O O O 1 1 .I I I l 1 1 O 1 l I I 0 O O 0 O O 1 1 I I l I 1 l O O O O 1 1 I I 1 l 1 l 1 1 I I O 1 1 1 O O O O 1 1 I O l 1 l 1 Fig. 5.10 Logic truth table for the incoming part of the CILU. 83 combinational logic designs to reduce the size of the network, since many of the inputs that affect the incoming network have no effect an the outgoing network and vice-versa. The ALU performs the arithmetic operations in the PE. High-level functional modules were used to simulate the ALU. High-level modules were used because the specific operation of the ALU was not important in analyzing the operation of the TDMP architecture. However, the capabil- ity to move input operands to and results and status information from the ALU is important to the operation. As a result in this model the ALU has two input operand registers called A and B that are gate connected to DBUS. Outputs from the ALU consist of a gated connection from the results (R) register and the status (ST) register to DBUS. The remaining data paths for the PE are the C-move processor, program memory and data memory. These data paths are exactly the same as those described in Section 5.1 for the TDS network. The purpose of the processor is to move under stored program control data and signalling to and from the data paths in the PE. In this simulation the data memory is used as a scratch pad by the processor during condi- tional move operations and the program memory contains the PE control program. 5 .3 TDMP Simulation Model Operation In describing the TDMP simulation model operation, we will assume that the PE and TDS program memories have been initialized prior to program execution. We will begin by discussing the PE simulation model operations followed by a discussion of the TDS simulation model operations. 
We end this chapter with a summary discussion of the overall TDMP simulation model.

5.3.1 PE Operation

The flowchart shown in Figure 5.11 and the program code shown in Figure 5.12 describe the programmed control operations of the PE. These instructions are stored in and fetched from the processor program memory. The PE begins by moving the ALU status, the incoming port A and B, and the outgoing port A and B receive signalling bits to the incoming and outgoing communication logic networks, respectively.

Fig. 5.11 Flowchart of the PE control algorithm.

Fig. 5.12 PE control program code.

The processor moves the generated incoming port A transmit signalling bit (ITA) from the CILU to the incoming port A transmit signalling register, ITSIGA. If this bit is logic 1, the processor moves the incoming A operand from IDOUTA to the ALU A input. If this bit is logic 0, the incoming A operand is not moved to the ALU. The ITA bit represents acknowledgment of receiving the incoming A operand. Similarly, ITB is moved from the CILU to the incoming port B transmit signalling register, ITSIGB. If this bit is logic 1, the processor moves the incoming B operand from IDOUTB to the ALU B input. If this bit is logic 0, the incoming B operand is not moved to the ALU. Next the processor moves the outgoing port A transmit signalling bit, OTA, from the CILU to the outgoing port A transmit signalling register, TSIGA. OTA represents a request to send results, as determined by the communications logic network. If this bit is logic 1, the processor moves the results from the ALU output, R, to the outgoing port A transmit results register, TRESA. If this bit is logic 0, the results are not moved. Similarly, the processor moves the outgoing port B transmit signalling bit, OTB, to the outgoing port B transmit signalling register, TSIGB. If this bit is logic 1, the processor also moves the results from the ALU output, R, to the outgoing port B transmit results register, TRESB. The processor now tests the communication logic network execution enable signal, EXEC.
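One complete pass through this control program, including the EXEC test and CILU clocking whose handling is described next, can be sketched compactly. The Python fragment below is only an illustration of the control flow, not the stored move-instruction program of Figure 5.12; the pe object and its attribute names are assumptions made for the example, while the register names follow the text.

    # Illustrative sketch of one pass through the PE control program
    # (not the original stored move-instruction code of Fig. 5.12).
    def pe_control_cycle(pe):
        # 1. Give the CILU its inputs: ALU status and receive signalling bits.
        pe.cilu.set_inputs(pe.alu.status, pe.out_port.rcv_sig, pe.in_port.rcv_sig)

        # 2. Acknowledge and accept incoming operands.
        pe.in_port.ITSIGA = pe.cilu.ITA
        if pe.cilu.ITA == 1:                    # acknowledge operand A
            pe.alu.A = pe.in_port.IDOUTA
        pe.in_port.ITSIGB = pe.cilu.ITB
        if pe.cilu.ITB == 1:                    # acknowledge operand B
            pe.alu.B = pe.in_port.IDOUTB

        # 3. Request to send results on the outgoing A and B frames.
        pe.out_port.TSIGA = pe.cilu.OTA
        if pe.cilu.OTA == 1:
            pe.out_port.TRESA = pe.alu.R
        pe.out_port.TSIGB = pe.cilu.OTB
        if pe.cilu.OTB == 1:
            pe.out_port.TRESB = pe.alu.R

        # 4. Start an ALU operation only when the CILU enables execution.
        if pe.cilu.EXEC == 1:
            pe.alu.INST = pe.instruction        # arithmetic op + start signal

        # 5. Clock the CILU so it can compute its next state, then repeat.
        pe.cilu.clock()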
If the EXEC signal is logic 1, the processor moves an arithmetic instruction and a start-execution signal to the ALU input, INST. If EXEC is logic 0, the arithmetic instruction and start-execution signal are not moved to the ALU. The processor now clocks the communication logic network so that it can generate its next state based on its current inputs and outputs. The processor then jumps to the beginning of its control program to repeat this cycle of move instructions.

5.3.2 TDS Operation

The flowchart shown in Figure 5.13 and the program code shown in Figure 5.14 describe the TDS network processor operations. The SYNC bit is a synchronization pulse that occurs at the beginning of each time slot. This bit is used to synchronize the TDS with the input and output time-division multiplexed buses.

Fig. 5.13 Flowchart for the TDS control algorithm.

Fig. 5.14 TDS control program code.

The TDS processor moves this bit to a scratch pad location in data memory. If SYNC is logic 1, each of the TDS processors moves the FRAME bit to a scratch pad location in its respective data memory. If SYNC is logic 0, the TDS processors continuously move the SYNC bit to the same scratch pad location in their data memories until SYNC is logic 1. If the FRAME bit is logic 1, TDS processor A performs the operations in the input processing branch of the TDS flowchart and TDS processor B performs the operations in the output processing branch of the TDS flowchart. If FRAME is logic 0, processor A performs the output processing operations and processor B performs the input processing operations. The input processing operations begin with a test of the INF bit. The INF bit is moved to a scratch pad location in memory. If INF is logic 1, the processor moves the contents of the SWAD register to the program counter register. The instruction pointed to at this address is a direct move or an indirect move of the DIN register contents, or some other type of requested switching service function. For this example, assume the instruction pointed to by the PC is an indirect move; the processor then moves the contents of the DIN register to a data memory location specified by the contents of the A register. The program now jumps back to the test SYNC instruction. If the INF bit was logic 0, the program does not perform any switching service and immediately jumps back to the test SYNC instruction.
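The whole TDS control loop can be seen at a glance by combining the input processing branch just described with the output processing branch described next. The Python sketch below is only an illustration under that reading of the flowchart; the tds objects, their memories and the helper names are assumptions, not part of the original simulation code.

    # Illustrative sketch of the TDS control loop (input and output branches),
    # not the original stored move-instruction code of Fig. 5.14.
    def tds_cycle(tds_a, tds_b):
        # Wait for the synchronization pulse that starts each time slot.
        while tds_a.SYNC != 1:
            tds_a.scratch["sync"] = tds_a.SYNC      # keep sampling SYNC
        tds_a.scratch["frame"] = tds_a.FRAME        # both processors latch FRAME
        tds_b.scratch["frame"] = tds_b.FRAME

        # FRAME selects which processor handles input vs. output processing.
        if tds_a.scratch["frame"] == 1:
            input_processing(tds_a); output_processing(tds_b)
        else:
            output_processing(tds_a); input_processing(tds_b)

    def input_processing(tds):
        tds.scratch["inf"] = tds.INF
        if tds.INF == 1:
            tds.PC = tds.SWAD                       # jump into the switch program
            # e.g. an indirect move: DIN -> data memory at the address in A
            tds.data_memory[tds.A] = tds.DIN
        # otherwise: no switching service this slot

    def output_processing(tds):
        if tds.OUTF == 0:
            tds.PC = tds.TSC                        # jump via the TSC contents
            # direct move from the data memory location named by the selected
            # move instruction (out_addr is an assumed attribute) to DOUT
            tds.DOUT = tds.data_memory[tds.out_addr]
        # otherwise: nothing to transmit this slot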
The output processing operations begin by testing the OUTF register. If OUTF is logic 0, the contents of TSC are moved to the processor program counter. The instruction pointed to at this address causes a jump to a program memory location that directly moves the contents of a location in data memory to the DOUT register and then returns to the test SYNC instruction. If OUTF is logic 1, the processor immediately jumps back to the test SYNC instruction without moving any data to the DOUT register.

5.4 TDMP Simulation Model Summary

Because of APL workspace limitations, all of the PE variables needed to simulate a twelve-PE TDMP architecture could not be created and stored in one file. However, by using a set-up and an end-up program, we were able to achieve the same effect. The set-up program takes shared variables such as the program memory, the incoming and outgoing port registers, etc., and makes them specific variables for a particular PE by adding a suffix to these variables. When computation is done, the end-up program stores the contents of these specific variables and removes the suffixes to make them shared variables again. Each time a PE is used it must use the set-up and end-up programs.

The APL simulation model provides us a means of simulating the TDMP system operation; however, timing problems can occur, since APL simulation does not include timing simulation methods which could predict timing problems. A timing simulator allows verification of proper circuit behavior in the presence of variation in gate delays. The worst case circuit behavior is obtained based on the minimum and maximum transition delays assigned to the gates in the model. By allowing observation of the circuit whenever simulation time is incremented, rather than waiting or assuming the circuit has stabilized, a better understanding of circuit operation may be obtained to aid in circuit design or in diagnosing problems. Detailed logic gate and timing simulation can be implemented in the APL model by writing programs that perform these functions. However, because these functions are not built into the APL simulation, the cost of implementing them in our model is high.

APL is a sequential simulation language, which means it cannot simulate concurrent activity. However, we can achieve the effect of performing concurrent activity by stopping the simulation timing clock, sequentially performing the computations, and then restarting the clock again. This is an awkward and expensive way to simulate concurrent activity; still, it is the best alternative available. But even given these shortcomings of the simulation language, it does allow us to obtain some useful and significant information for predicting TDMP's system performance. This performance will be discussed with the aid of sample computations in the next chapter.

CHAPTER VI

TDMP PERFORMANCE EVALUATION

Chapter 5 described the TDMP APL simulation model and its operation. In this chapter we evaluate the performance of the TDMP design based on this model. In our investigation we did not design many evaluation tools to measure system performance. For example, we did not design any hardware monitors for measurements of hardware activity or software monitors for event recording during program execution, nor did we develop workloads that represent the work expected of the system. We feel that a rigorous performance evaluation of multiprocessor systems employing these techniques is a research project unto itself.
In addition, there is a question of whether new performance tools are needed, or of how the currently used performance tools need to be extended to be applicable to multiprocessor systems. As a result, we relied primarily on monitoring signals and registers in simulated clock time for recording the activity of small portions of the system, and on monitoring computational results in simulated clock time for measuring total system performance. Section 6.1 discusses the qualitative and quantitative performance results obtained from this simulation model. On the one hand, we have the conventional single-processor systems, which have many uses but whose range of use is limited by the von Neumann bottleneck. On the other hand, we have fixed-array-processor systems, which overcome the von Neumann bottleneck but at the expense of limiting their range of use by the fixed interconnection structure. TDMP eliminates or improves upon the range-of-use limiting factors in both single-processor and fixed-array-processor systems. In Section 6.2 we use data bandwidth as a measure of computational performance and compare the data bandwidths of single-processor, TDMP, and fixed-array-processor systems in computing a fast-Fourier-transform (FFT) and a digital filtering algorithm. This comparison is done to show how much TDMP improves arithmetic computation over single-processor systems and how much data bandwidth we must sacrifice to obtain flexibility over fixed-array-processor systems. The two algorithms chosen for the comparison represent the important class of recursive algorithms used in digital signal processing. In Section 6.3, we discuss what has and has not been learned from the simulation. And, finally, Section 6.4 describes the differences between the TDMP data flow processor and Dennis's data flow processor.37

6.1 Performance Data

The performance data obtained from the simulation model is based on a communication-saturated TDMP system with one processing element. By communication saturated we mean that the system bandwidth is dominated by the interprocessor communication, and the processor arithmetic execution time is negligible compared to this communication time. With this approach, the bottlenecks, delays and throughput we examine are a result of the TDMP architecture only. The PE computes results and transmits these results back to itself through the TDS network. Figure 6.1 shows a block diagram of the simulation model, with the PE shown in two parts for clarity. In the model, time slot zero, TS0, identifies the outgoing port of the PE and time slot one, TS1, identifies the incoming port of the PE. With this model we were able to test the communication protocols between the PE outgoing and incoming communication ports, the A and B frame submultiplexing, the time-division switching network and the data flow control.

The simulation model is synchronized to a master clock. Four simulation clock pulses correspond to one read/write memory cycle, t_m. From the simulation we found that the maximum rate at which a PE can generate result packets, T_N, is once every three cycles through its control program. The first cycle loads the operands into the ALU, the second cycle initiates computation and the third cycle transmits the results. In the PE control program shown in Section 5.3, this corresponds to 120 t_m, assuming that the needed operands have been received and the ALU execution time is zero.
However, if the operands are not immediately available, the time delay in receiving these operands reduces the rate at which the PE generates result packets. This time delay in receiving operands is attributed to the transmission, switching, and packet flow control.

Fig. 6.1 Block diagram representation of a single PE TDMP model.

The worst case transmission delay, T_t, is caused by the time-division multiplexing and submultiplexing of the information packets. In the model T_t = 2NT_X, where N is the number of time slots, T_X is the time slot size, and 2 is the number of submultiplexed frames. The worst case switching delay, T_D, also equals 2NT_X, since in the model the switch sampling period equals the time slot size. The transmission and switching delays are fixed, and each transmitted packet must sequentially experience these delays. This results in a pipeline realization of packets, with the sections of the pipe consisting of the packet generation, transmission and switching. Without including the packet flow control, and assuming the worst case, we found that the model, computing with two input operands, produces the first result packet in T_N + T_t + T_D units of time and the following result packets every T units of time, where T is the maximum of (T_N, T_t, T_D).

The timing diagram shown in Figure 6.2 is the one used for timing and control of the two time slot model. From this diagram we find that the time slot size is 98 simulation clock pulses, which corresponds to 24.5 t_m. If we assume t_m = 30 nsec, then T_D = T_t = (2)(2)(24.5)(30 nsec) = 2.94 μsec and T_N = (120)(30 nsec) = 3.6 μsec. As a result T = T_N, since T_D = T_t < T_N. For a large number of operations this corresponds to a data bandwidth of 277 kHz, where the data bandwidth is defined as the maximum number of results that can be generated per unit time, b_D = 1/T. However, if N > 2 time slots, then T_D = T_t > T_N and T = T_D = 1.47N μsec, resulting in b_D = 1/(1.47N μsec).

Fig. 6.2 Timing diagram for the two time slot model.

If the packet flow control is added to the model, the previously computed b_D decreases significantly. The decrease in bandwidth is due to the interruption of the pipeline operation and the added delay, both caused by the packet flow control protocol. The packet flow control protocol is responsible for preventing new result packets from being generated before previously transmitted result packets have been acknowledged. Since the acknowledge signal is returned after the request signal and data have been received, the time between generating new result packets increases by the amount of time it takes the receiver PE to generate an acknowledge signal plus the time it takes the transmitter PE to receive it. The worst case time for generating an acknowledge signal, T_a, is one cycle through the PE control program, or 40 t_m. This time, plus the transmission and switching delay experienced by the acknowledge signal, is added to the packet generation, transmission and switching time of the result packet sent. As a consequence,

T = T_N + T_a + T_t + T_D = (4.8 + 2.94N) μsec

and, from the model where N = 2,

b_D = 1/T = 93.6 kHz.
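The worst case delay and bandwidth expressions above reduce to simple arithmetic, and it can help to see them evaluated together. The short Python sketch below simply recomputes the numbers quoted in the text (t_m = 30 nsec, T_X = 24.5 t_m, two submultiplexed frames); the function and constant names are ours, not part of the simulation model.

    # Recompute the worst-case TDMP delays and data bandwidths quoted in the text.
    T_M_CYCLE = 30e-9            # t_m: read/write memory cycle time (sec)
    T_SLOT    = 24.5 * T_M_CYCLE # T_X: time slot size (98 simulation clock pulses)
    T_GEN     = 120 * T_M_CYCLE  # T_N: packet generation time (3 control cycles)
    T_ACK     = 40  * T_M_CYCLE  # T_a: time to generate an acknowledge signal

    def bandwidth_no_flow_control(n_slots):
        t_t = 2 * n_slots * T_SLOT           # transmission delay (2 frames)
        t_d = 2 * n_slots * T_SLOT           # switching delay
        t = max(T_GEN, t_t, t_d)             # pipeline limited by its slowest stage
        return 1.0 / t

    def bandwidth_with_flow_control(n_slots):
        t_t = 2 * n_slots * T_SLOT
        t_d = 2 * n_slots * T_SLOT
        t = T_GEN + T_ACK + t_t + t_d        # result packet plus acknowledgment
        return 1.0 / t

    print(bandwidth_no_flow_control(2))      # about 277 kHz
    print(bandwidth_with_flow_control(2))    # about 93.6 kHz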
If sufficiently large receiver first-in-first-out (FIFO) queues, or receive buffers with arbitration circuitry, are used to store and control the incoming packets to the PE, then the packet flow protocol can be omitted and b_D is increased to 277 kHz for N = 2, or to 1/(1.47N μsec) if N > 2 time slots. These are the data bandwidths we obtained for the model when we did not consider the packet flow control. The cost of this bandwidth improvement is a decrease in system reliability, because the acknowledge signal used for packet flow control was our only verification that the transmitted packet was received successfully.

6.2 Comparison of Computational Performance

We decided to use the performance data obtained from the simple one-PE model of TDMP to determine and calculate the (data) bandwidth of a large TDMP system suitable for implementing FFT and digital filtering algorithms. This decision was based on the high cost of the TDMP APL simulation and the small amount of new information that would be gained in simulating a large system. This single-PE model is justified because TDMP is a synchronous system and there is no contention for communication. Each PE is given a designated time slot to transmit and receive data. As a result all PEs are equal, and we can determine, on a worst case basis, the time required for each PE to compute, transmit and receive results. This is different from an asynchronous system, where the time for each PE to transmit and receive results varies and where, depending on the type of interconnection network, there may be some contention for communication. The (data) bandwidth was chosen as the performance index in comparing the computational performance of TDMP, single-processor and fixed-array-processor systems. This choice is more appropriate than the memory cycle time or the instruction execution time, since it allows for the fact that instructions may be executed concurrently. The evaluation of the b_D's is based on each system performing the same FFT and digital filtering algorithms with comparable processors. We begin this analysis with a comparison of the b_D's resulting from the processing systems performing an FFT algorithm.

The FFT algorithm implemented on each processing system is based on the decimation-in-time principle and is illustrated for an 8-point sequence in the signal flow graph shown in Figure 6.3. In the signal flow graph there are four columns, and each column contains eight entries. For the sake of clarity, the two-dimensional variable y(k,i) is used to denote the value of a given node in the array, where k is the number of the column and i is the number of the component within the column. At the node corresponding to column k and row i, the variable y(k,i) is found from an equation of the form

y(k,i) = y(k-1,i1) + W^r y(k-1,i2)     (1)

where i1, i2 and r are functions of the location within the array and W = e^(-j2π/N).

Fig. 6.3 Signal flow graph of an 8-pt. FFT.

In each case, the dashed line connecting the variable in column k-1 with column k refers to the first term on the right-hand side of Equation (1), i.e., the nonweighted term. The solid line refers to the second term on the right-hand side of (1), i.e., the weighted term. The number in the circle is the degree of W, as indicated by the term r in (1).
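Equation (1) is the standard radix-2 decimation-in-time butterfly. As a concrete check of the kind of signal flow graph drawn in Figure 6.3, the Python sketch below evaluates the columns y(k,i) for an 8-point sequence using exactly this recurrence; the index functions i1, i2 and the twiddle exponent r are computed in the usual bit-reversed-input, in-place form, which is one common realization of such a graph rather than a transcription of the figure itself.

    import cmath

    def fft_dit(x):
        """Power-of-two decimation-in-time FFT built from the butterfly
        y(k,i) = y(k-1,i1) + W**r * y(k-1,i2) of Equation (1)."""
        n = len(x)
        levels = n.bit_length() - 1
        # Column k = 0: input samples in bit-reversed order.
        y = [x[int(format(i, f"0{levels}b")[::-1], 2)] for i in range(n)]
        w = cmath.exp(-2j * cmath.pi / n)          # W = e^(-j*2*pi/N)
        for k in range(1, levels + 1):             # columns of the flow graph
            span = 1 << k
            half = span >> 1
            nxt = y[:]                             # column k built from column k-1
            for start in range(0, n, span):
                for j in range(half):
                    r = j * (n // span)            # degree of W at this node
                    top, bot = start + j, start + j + half
                    nxt[top] = y[top] + w**r * y[bot]
                    nxt[bot] = y[top] - w**r * y[bot]   # same as W**(r + N/2) weight
            y = nxt
        return y

    # Example: the transform of an impulse is a column of ones.
    print(fft_dit([1, 0, 0, 0, 0, 0, 0, 0]))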
The important parameters needed to determine and compare the (data) bandwidth among the three systems implementing this FFT algorithm are defined as follows:

N = the number of data points in the FFT
T_E = execution time of the processor in performing the butterfly computation, y3 = y1 + W^r y2
M = the average number of FFT operations to be performed
T_X = time slot size
T = worst case TDMP system delay
t_r = register delay
t_m = read/write memory cycle time
b_D(M) = data bandwidth for a finite number of FFT operations
b_D = data bandwidth when the number of FFT operations is large (M -> infinity)

The b_D analysis assumes that all initialization of the processors has occurred prior to the beginning of the FFT program execution, and that each individual processor performs the same computation, y3 = y1 + W^r y2, in the same amount of time. To compute y3, each processor must perform 4 real multiplications and 2 additions. We also assume that the input and output interfaces for each system are manageable and not a limiting factor in the b_D performance. With these assumptions given, the b_D of a single-processor system, b_D1, can be determined from the following equation:

b_D1 = lim (M -> infinity) b_D1(M) = 1 / [T_E (N log2 N) + 6 t_m (N log2 N)]     (2)

In Equation (2), T_E (N log2 N) is the arithmetic execution time needed to compute one FFT operation with a single processor, and 6 t_m (N log2 N) is the accessing and storage time of the real and imaginary parts of the two input operands and the results during one FFT operation.

The TDMP b_D is based on a TDMP system with N log2 N PEs. Each PE is assigned a location in the N x log2 N pipeline array of processors, as represented in the signal flow graph shown in Figure 6.3 for N = 8. Each PE is programmed to execute the butterfly computation as a single data flow operation and to transmit the results of this computation to the next column of PEs based on its location assignment within the array. This arrangement of PEs forms an asynchronous pipeline flow of result packets, with the synchronization of the computations controlled by the data flow principles at the architecture level. The worst case b_D for TDMP is characterized by the following equations:

b_D2(M) = M / [(T_E + T) log2 N + (T_E + T)(M - 1)]     (3)

and

b_D2 = lim (M -> infinity) b_D2(M) = 1 / (T_E + T)     (4)

where T is the worst case TDMP system delay. T is (4.8 + 2.94N) μsec if the packet flow control protocol is used and 1.47N μsec if it is not used. These two cases correspond to not using and to using input queues as buffers in the PEs. The term (T_E + T) log2 N is the system delay, i.e., the time before the first results appear at the output of the system. After the system delay, an FFT operation result appears every T_E + T units of time for an unbounded number of operations.

The fixed-array-processor directly implements the signal flow graph of Figure 6.3 with hardwired interconnections between the processors. Each interconnection path contains a register to synchronize the data flow between processors. The execution time, T_E, and the register delay time, t_r, determine the processor b_D:

b_D3(M) = M / [(T_E + t_r) log2 N + (T_E + t_r)(M - 1)]

b_D3 = lim (M -> infinity) b_D3(M) = 1 / (T_E + t_r)

Figure 6.4 shows a graph of the FFT data bandwidths for the single-processor, TDMP, and fixed-array-processor as a function of the number of operations, M. The curves were generated assuming that N = 8, t_m = 30 nsec, t_r = 10 nsec and T_E = (4T_M + 2T_A), where T_M and T_A are respectively the hardware processor floating-point multiplication time (27 μsec) and addition time (14 μsec).
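Equations (2) through (4) and the fixed-array expressions are easy to tabulate for the parameter values just quoted. The Python sketch below is our own check on the comparison, not part of the original analysis; it uses the curve parameters as read from the text (N = 8, t_m = 30 nsec, t_r = 10 nsec, T_M = 27 μsec, T_A = 14 μsec), and the two choices of T correspond to operating with and without the packet flow control protocol. The exact improvement factors obtained this way depend on how those parameters are read from the figure.

    from math import log2

    # Curve parameters from the text (all times in microseconds).
    N   = 8
    t_m = 0.030                  # read/write memory cycle time
    t_r = 0.010                  # register delay
    T_E = 4 * 27.0 + 2 * 14.0    # butterfly time: 4 multiplies + 2 adds

    def T_tdmp(n_slots, flow_control=True):
        # Worst case TDMP system delay from Section 6.1.
        return 4.8 + 2.94 * n_slots if flow_control else 1.47 * n_slots

    def b_single():                              # Equation (2)
        return 1.0 / (T_E * N * log2(N) + 6 * t_m * N * log2(N))

    def b_tdmp(M, T):                            # Equations (3) and (4)
        return M / ((T_E + T) * log2(N) + (T_E + T) * (M - 1))

    def b_fixed_array(M):
        return M / ((T_E + t_r) * log2(N) + (T_E + t_r) * (M - 1))

    M = 10_000                                   # a "large" number of FFT operations
    print("single-processor b_D  :", b_single())
    print("fixed-array b_D       :", b_fixed_array(M))
    print("TDMP b_D, flow control:", b_tdmp(M, T_tdmp(N, True)))
    print("TDMP b_D, with queues :", b_tdmp(M, T_tdmp(N, False)))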
Fig. 6.4 Bandwidth comparison as a function of the average number of FFT operations.

The curves in Figure 6.4 show that, for a large number of FFT operations, the TDMP b_D is more than 15-times larger than the b_D of a single-processor system and 1.5-times smaller than that of a fixed-array-processor in computing an 8-point FFT. The b_D of the fixed-array-processor can be viewed as an upper bound and as the maximum b_D improvement we can obtain over the single-processor with TDMP. In order to achieve this maximum improvement we must reduce the interprocessor communication overhead, either with hardware or by reducing the communication protocols that are in software. For example, the dashed curve shows the improvement in the TDMP b_D if input queues are used. In this case the TDMP b_D is more than 19-times larger than the b_D of a single-processor and 1.2-times smaller than that of a fixed-array-processor. This comparison shows that for a small reduction in b_D relative to the fixed-array-processor, TDMP enhances both the single-processor and the fixed-array-processor range of usefulness, e.g., it increases b_D over single processors and adds flexibility over fixed-array-processors.

Fig. 6.5 Bandwidth comparison as a function of PE execution time for large M.

Figure 6.5 shows the b_D's as a function of the processor execution time for large M. From these curves we can see that the single-processor system has a larger b_D than TDMP if the execution time is less than 2.8 μsec, and that as the execution time increases above 2.8 μsec this situation reverses. From the same curves we see that if T_E is greater than about 1000 μsec, the b_D of TDMP and that of the fixed-array-processor are about the same. These results are very important and can be easily explained. In the first case, where the TDMP b_D is less than that of a single-processor, the PEs in the TDMP are spending most of their time communicating and waiting for new operands, because the computation time to communication time ratio is small (< 0.04). In the second case, where the TDMP b_D is comparable to that of the fixed-array-processor, the computation time to communication time ratio is large (> 13), so the PEs spend the majority of their time computing instead of idling. This points out the importance of granularity in TDMP. Granularity is the size of the task a PE must perform before it is required to communicate with other PEs in the system. For TDMP the granularity must be large, such that the computation time to communication time ratio is large for a given problem.

To illustrate the advantage that TDMP has over fixed-array-processors for this class of problems, we simply change the application problem. Changing the application problem in TDMP requires no hardware changes. We simply reprogram the data paths and input new information packets with new tag values, assuming of course that the program for the new task has been previously stored in the PE. However, changing the application in a fixed-array-processor requires changing the hardwired interconnection paths between processors, or possibly experiencing a significant degradation in the b_D. In addition, if the problem size changes, unneeded PEs in the TDMP architecture can be utilized for multiprogramming purposes.
Since TDMP is based on data flow principles and suffers no side effects of concurrent processing, we can execute two or more different application problems simultaneously. There is no way in which fixed-array-processors can do this, because in fixed-array-processors the correctness of the results requires all processors to be synchronized together. As a result, unneeded processors must remain idle, thus reducing the utilization of the fixed-array-processing system.

As a second example of TDMP performance, we will compare the b_D's of the three processing systems implementing the digital filtering algorithm given in Chapter 3 and shown in Figure 3.8. In the single-processor system, 4 multiplications, 3 additions, and 8 memory accesses and stores must be accomplished to compute one filter result. Thus

b_D1 = 1 / (4 t_M + 3 t_A + 8 t_m)

For the TDMP system, we assume N = 8 to make the architecture fit the problem size and directly implement the signal flow graph shown in Figure 3.8. This results in

b_D2 = 1 / (t_M + T)

This b_D assumes that many filter computations are performed and that the multiplication time is larger than the addition time. Similarly, for the fixed-array-processor system we assume an architecture size to fit the problem size and directly implement the signal flow graph, which results in

b_D3 = 1 / (t_M + t_r)

Fig. 6.6 Bandwidths as a function of multiplication time for the digital filtering computations.

Figure 6.6 shows the b_D's for each processing system as a function of the processor multiplication time. In this example we see that the b_D of TDMP is larger than that of the single-processor for all values of processor multiplication time greater than 15 nsec, and less than that of the fixed-array-processor until the multiplication time exceeds 500 nsec. The starting point of 15 nsec was used so that our equations for the TDMP and fixed-array-processor bandwidths will be valid, since t_A < t_M is required for these equations to be correct. The results from this analysis again show us the effect that communication overhead has on the TDMP b_D and the amount by which TDMP enhances b_D over single-processor systems.
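As with the FFT comparison, the three digital-filter bandwidth expressions can be evaluated directly. The short Python sketch below simply tabulates b_D1, b_D2 and b_D3 over a range of multiplication times; the specific values of t_A, t_m, t_r and T used here are illustrative assumptions of ours (the text only fixes the relation t_A < t_M), so the printed numbers should not be read back into Figure 6.6.

    # Tabulate the three digital-filter data bandwidths as functions of the
    # processor multiplication time t_M.  Parameter values are assumed.
    def b_d1(t_mul, t_add, t_mem):
        # single processor: 4 multiplies, 3 adds, 8 memory accesses/stores
        return 1.0 / (4 * t_mul + 3 * t_add + 8 * t_mem)

    def b_d2(t_mul, T):
        # TDMP (N = 8): one multiply per PE plus the worst case system delay T
        return 1.0 / (t_mul + T)

    def b_d3(t_mul, t_reg):
        # fixed-array processor: one multiply plus one register delay per stage
        return 1.0 / (t_mul + t_reg)

    t_add, t_mem, t_reg, T = 10e-9, 30e-9, 10e-9, 100e-9    # assumed values (sec)
    for t_mul in (15e-9, 50e-9, 200e-9, 500e-9, 2e-6):
        print(t_mul, b_d1(t_mul, t_add, t_mem), b_d2(t_mul, T), b_d3(t_mul, t_reg))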
6.3 Discussion of the Simulation

The simulation model was the driving force behind designing the specific components, and the interrelationships among these components, that make a TDMP system work. By using a simulation model of the design, we did not have to purchase, build and test printed circuit boards and the LSI components on these boards in order to observe the performance and operation of the TDMP system. With simulation we were able to test new ideas quickly and to make changes easily. We also were able to develop the precise timing relationships for the system operation and to determine what influences these relationships. We did not find out how TDMP performs when the system hardware components are not ideal but vary with temperature and age. We were not able to truly simulate concurrent PE activity, because the simulation processor was sequential. The capacitance of the bus data paths that influences signal propagation could not be simulated inexpensively. To simulate this capacitance we would need a stick layout of the integrated circuit design of the TDMP architecture in order to determine, at best, a good guess of the bus lengths necessary to calculate a capacitance value that could be simulated for each bus data path in this model. This approach is very expensive and time consuming, and it is not necessary for this initial simulation model, so it was not done. However, this should be done when simulating the architecture for actual VLSI implementation, to study its effect on performance.

6.4 TDMP and Dennis Data Flow Models

There are two primary differences between the TDMP data flow architecture and the data flow architecture developed by Dennis at MIT.37 The first difference is that in TDMP the data flow operators and memory cells are in the same PE module, with an interprocessor communication network used to interchange result information packets between PEs. In Dennis's model the operators and memory cells are separated. All information packets are transmitted through an arbitration network from the memory section to the operators, and all result information packets are transmitted from the operators through a distribution network to the memory cells. The TDMP approach does not increase the amount of concurrent activity over Dennis's model but rather allows for locality of information flow. This approach reduces the amount of parallelism in a problem required for high PE utilization. The argument about locality of information flow is true for any multiple processor machine in which each processor holds a part of an overall program. However, two factors increase the likelihood that such good decomposition can be achieved for TDMP. First, since TDMP is based on data flow principles, operations are constrained only by the availability of data; tolerance of asynchronous behavior at the architecture level allows greater flexibility in mapping programs. Second, since data flow languages are free of side effects, decomposition is an easier task to perform. The second difference is that TDMP uses time-division communication techniques for the transmission and switching of information packets, whereas Dennis's model is designed for space-division techniques. The time-division techniques are better at conserving valuable chip area than the space-division techniques for single-chip VLSI implementation. And the use of an intelligent switching network in TDMP allows for the possibility of performing operations other than "plain old switching". For example, the switch could modify the data as it is being transmitted to the destination PE, or implement some of the system operating functions to improve reliability and diagnosability in the system.

CHAPTER VII

CONCLUSIONS

7.1 Summary

Advances in improving the computer's computational speed result from reducing the switching delay times of elementary logic gates, improving methods for performing primitive arithmetic operations, and designing computer architectures that overcome the intrinsic speed limitations of the von Neumann machine. While the first two approaches improve the computational speed, the third approach provides the potential for making the largest increase in computational speed. Here, multiprocessors are employed; these multiprocessor architectures exploit the inherent parallelism in specific tasks, thereby eliminating the single-processor bottleneck -- the "von Neumann bottleneck". In addition, VLSI is making it feasible to build these high-performance multiprocessor structures as low-cost, flexible, single-chip system components.
So, the purpose of the research reported here was to investigate a multiple processor computer structure that exploits this technology to improve the bandwidth of comparable single-processor machines and to increase the flexibility of existing fixed-array multiprocessors. Within the research reported here, the investigation attained several objectives. First, it surveyed the previous work on multiprocessor architectures to identify their principal attributes, and it suggested how well suited these attributes are for VLSI implementation for enhanced computation. Desirable multiprocessor VLSI attributes include:

— majority of the processors are identical;
— processors and memory all in close proximity;
— simple underlying communication geometry;
— regular and modular structure;
— few interconnection data paths between processors;
— flexible data paths.

Next, it presented an architecture called TDMP (Time Division Multiple Processing) and described how it achieves the following general properties:

— greater bandwidth than comparable single-processor architectures;
— less complex switching network than a crossbar switch interconnection network;
— more flexible than a fixed-array network;
— full-access and non-blocking interconnection capabilities;
— simply extends to pipeline operation;
— highly compatible with data flow algorithms;
— amenable to VLSI implementation.

A computer-based simulation model of TDMP was developed to investigate the computational capabilities, the hardware components and the interrelationship of these components with one another. Finally, through two algorithms that represent the important class of recursive algorithms used in digital signal processing, it compared the computational data bandwidth of TDMP to comparable single-processor and fixed-array-processor systems, to show how much TDMP improves arithmetic computation bandwidth over single-processor systems and how much bandwidth we must give up to obtain flexibility over fixed-array-processor systems. By satisfying these objectives, the investigation achieved the overall purpose of the research project.
The idea of organizing a computer to operate on data as soon as its operands beume available has been discussed by Dennis, Gurd and Watson, Arvind, Miller and Cocke.36’37 However, none of these authors has suggested a detailed and efficient scheme for communicating information packets to processors for computation. This investigation has shown through simulations that an integrated interprocessor communiution system using time-division multiplexing and time-division switching offers an attractive solution to this problem. Also it was shown that TDMP is unique beuuse time-division communiution techniques have not been previously investigated in data flow or conventional multiprocessor architectures. To develop TDMP, we built and tested an APL simulation model of the architecture. The investigation found that the use of such a model was helpful in the design and analysis of the architecture. Using the performance data of a single PE TDMP model, it was shown that TDMP un improve the data bandwidth (computational speed) over single-processor systems by factors of 15 and 10. when computing FFT‘s and digital filtering results, respectively. The analysis also indiuted that in order to achieve this improvement the granularity of tasks for 114 each PE must be chosen such that the computation time to communiution time ratio for each PE is large. The investigation also revealed some negative attributes and deficiencies of the simulation languages available for our simulation purposes. The negative attributes, common to all of the simulation languages considered in this investigation, result from these languages not being able to truly simulate concurrent processing or asynchronous events. The reason for this is these languages are based on a single-processor executing instructions sequentially. The deficiencies of each simulation language varies with the language type. For example, APL does not have a built in timing simulator but GPSS does. However, GPSS is not particularly suited to the translation from a hardware representation to a computer program but APL is. The negative attributes and deficiencies of each simulation language considered in this investigation is discussed in more detail in Chapter 5 of this thesis. Using the results of this study, designers of multiprocessor systems will be able to create advance systems with increased computational performance over single-processor systems, a wider range of use over fixed-array-processor systems and a hardware design that copes with interconnection data path complexity imposed by VLSI implementation. Second, since TDMP is based on data flow principals, i.e. operations are constrained only by the availability of data, the architecture allows greater flexibility in mapping programs and since, data flow languages are free of side efl'ects decomposition of programs is an easier task to perform. The second results will aid programmers in developing parallel software to keep the parallel hardware fully occu- pied. The third result is an APL computer model of the TDMP system which provides a model for developers of multiprocessor systems to use in investigating the computation bandwidth of TDMP if some of the parameters of the structure are varied. For example the bandwidth for a specific problem can be investigated as the number of time slots, the time slot size and the number of multiplexed buses is varied. 
Currently, the model has the flexibility to vary the number of time slots, the time slot size and the control software for both the PEs and the time-division switch. However, it cannot vary the number of time-division multiplexed (TDM) buses. This feature can be implemented by duplicating existing parts of the model and adding a new time multiplex switch part. This change results in a new time-division switch with multiple input and output lines. The new switch can switch data between processors on the same or on different time-division multiplex buses. Changing the number of time slots on a TDM bus or the number of TDM buses corresponds to varying the N and M parameters of the model, respectively. And finally, the use of an integrated communication system based on time-division techniques has not been previously investigated in multiprocessor structures for enhanced computation. So the results of this investigation provide designers with a new trajectory for multiprocessor design and with the implications of using these techniques in those designs.

7.2 Further Research

As with any research project, the investigation reported here points toward several areas of additional study. Further investigation can be made into showing new options possible in multiprocessor architectures as a result of using time-division communication techniques. For example, since the switching network is a C-move processor, we can now think of ways to enhance computation by modifying the data as it is being switched. We may also be able to modify the switch to improve reliability and diagnosability in the system. The research can be extended into analyzing the data bandwidth, hardware and software requirements of TDMP if input queues or buffers with arbiters are added to the processing elements. This corresponds to making the processing elements into multiple-input data flow operators. Research into dynamically allocating multiple time slots to PEs can be investigated. This improves the time-division multiplex bus bandwidth utilization, because unused time slots caused by idle or non-communicating PEs can be allocated to PEs that need to transmit or receive data more frequently. Research into the modularity of TDMP can be further investigated. By employing the concepts of time multiplex switching and pipelining, larger TDMP systems can be developed from simpler TDMP systems to solve larger problems. And, finally, further research into the development of higher-level programming languages is needed. The underlying trouble with most current attempts to use parallel hardware is that they are based on traditional concepts of programming. These concepts in turn are based wholly on the serial von Neumann computer design, with instructions executed one at a time. In view of the nature of parallel hardware systems and the practical difficulties of keeping them running at full speed, we suggest further investigation of data flow programs based on the TDMP computer model.

REFERENCES

1. R. Sugarman, "Superpower Computers," IEEE Spectrum, April 1980.
2. C. Mead and L. Conway, Introduction to VLSI Systems, Addison-Wesley Publishing Company, Inc., 1980.
3. K. Hwang, Computer Arithmetic, John Wiley and Sons, Inc., 1979.
4. S. S. Reddi and E. A. Feustel, "A Conceptual Framework for Computer Architecture," Computing Surveys, Vol. 8, No. 2, June 1976.
5. A. Mohsen, "Device and Circuit Design for VLSI," Caltech Conference on VLSI, January 1979.
6. G. A. Jullien, "Using ROM Arrays to Implement Computer Arithmetic," IEEE Computer Arithmetic, 1979.
7. C. Mead, "VLSI and Technological Innovation," Caltech Conference on VLSI, January 1979.
8. R. Benard, "Computers: Emphasis on Software," IEEE Spectrum, January 1980.
9. Intel Corporation, 8086 Family User's Manual, October 1979.
10. Intel Corporation, 8087 Co-processor, private communication, May 1980.
11. R. Friedenson, "RC Active Filters for the D3 Channel Bank," Memorandum for File, Bell Labs, December 1971.
12. K. Thurber, "Associative and Parallel Processors," Computing Surveys, Vol. 7, No. 4, December 1974.
13. J. P. Hayes, Computer Architecture and Organization, McGraw-Hill Book Company, pp. 209-236, 1978.
14. W. Wulf and C. Bell, "C.mmp--A Multi-Mini-Processor," Fall Joint Computer Conference, pp. 765-777, 1972.
15. F. Briggs, K. S. Fu, K. Hwang and J. Patel, "PM4--A Reconfigurable Multiprocessor System for Pattern Recognition and Image Processing," National Computer Conference, pp. 225-265, 1979.
16. W. Wulf and S. Harbison, "Reflections in a Pool of Processors--An Experience Report on C.mmp/Hydra," National Computer Conference, pp. 939-950, 1978.
17. J. R. Jump and S. Ahuja, "Effective Pipelining of Digital Systems," IEEE Transactions on Computers, Vol. C-27, No. 9, September 1978.
18. C. V. Ramamoorthy and H. F. Li, "Pipeline Architecture," Computing Surveys, Vol. 9, No. 1, March 1977.
19. G. H. Barnes, et al., "The Illiac IV Computer," IEEE Transactions on Computers, Vol. C-17, pp. 746-757, August 1968.
20. K. J. Thurber, "Parallel Processor Architectures," Computer Design, pp. 89-114, February 1979.
21. H. T. Kung, "Let's Design Algorithms for VLSI Systems," Caltech Conference on VLSI, January 1979.
22. A. L. Davis, "A Data-Driven Machine Architecture Suitable for VLSI Implementation," Caltech Conference on VLSI, January 1979.
23. B. W. Arden and H. Lee, "Analysis of Chordal Ring Network," Workshop on Interconnection Networks for Parallel and Distributed Processing, April 1980.
24. M. A. Franklin, "VLSI Performance Comparison of Banyan and Crossbar Communications Networks," Workshop on Interconnection Networks, pp. 20-28, April 1980.
25. F. Briggs, PM4, private communication, May 1980.
26. H. J. Smith and D. S. Smith, "Study of Multistage Interconnection Networks," Proceedings of the Fifth Annual Conference on Computer Architecture, April 1978.
27. D. M. Dias and J. R. Jump, "Analysis and Simulation of Buffered Delta Networks," Workshop on Interconnection Networks, April 1980.
28. M. C. Pease, III, "The Indirect Binary n-Cube Microprocessor Array," IEEE Transactions on Computers, Vol. C-26, No. 5, May 1977.
29. R. G. Hintz and D. P. Tate, "Control Data STAR-100 Processor Design," Proceedings of the 6th Annual IEEE Computer Society International Conference (COMPCON '72), San Francisco, CA, pp. 1-4, September 1972.
30. R. L. Sites, "An Analysis of the Cray-1 Computer," Computer Design, January 1979.
31. M. Satyanarayanan, "Commercial Multiprocessing Systems," IEEE Computer Society, May 1980.
32. A. E. Joel, Jr., "Digital Switching--How It Has Developed," IEEE Transactions on Communications, Vol. COM-27, No. 7, July 1979.
33. H. J. Siegel, "Interconnection Networks for SIMD Machines," Computer, pp. 57-65, June 1979.
34. D. Tabak and G. J. Lipovski, "Move Architecture in Digital Controllers," IEEE Journal of Solid State Circuits, Vol. SC-15, No. 1, February 1980.
35. J. B. Dennis and D. P. Misunas, "A Preliminary Architecture for a Basic Data Flow Processor," Computer Architecture, pp. 126-132, 1975.
36. J. Gurd and I. Watson, "Data Driven System for High Speed Parallel Computing - Part I: Structuring Software for Parallel Execution," Computer Design, June 1980.
37. J. B. Dennis, "First Version of a Data Flow Procedure Language," Project MAC, MIT, Cambridge, MA, 1974.
38. J. Rumbaugh, "A Data Flow Multiprocessor," IEEE Transactions on Computers, 1977.
39. J. Gurd and I. Watson, "Data Driven System for High Speed Parallel Computing - Part II: Hardware Design," Computer Design, July 1980.
40. M. Malek and W. W. Myre, "Figures of Merit for Interconnection Networks," Proc. of Workshop on Interconnection Networks, April 1980.
41. H. Levin, Introduction to Computer Analysis: ECAP for Electronic Technicians and Engineers, Prentice-Hall, 1970.
42. T. J. Schriber, Simulation Using GPSS, Wiley, 1974.
43. P. A. Bobillier, B. C. Kahan, and A. R. Probst, Simulation with GPSS and GPSS V, Prentice-Hall, 1976.
44. G. A. Blaauw, Digital System Implementation, Prentice-Hall, Inc., 1976.
45. L. Gilman and A. J. Rose, APL: An Interactive Approach, John Wiley and Sons, Inc., 1974.
46. Control Data Corporation, APL Version 2 Reference Manual, 1979.