5 ' " "' (3 "' '1I"\I"~° I:(|;I'1 "‘3‘,“ jg." ~5'.I ‘ .. I555 5m“ . 5:55de 5 . £555 .. .,5-45 - ‘5 ’5' ..'.".' 5 555 'I5 ...'5IRI5YJH%II '55%b@55 "I-"". é:""'55}b( ." ." -5 '.' 5,5 'J';flu* 1m . Ivz‘vqu'fgzfir'}; 'l'-".l.5 "'""'¥: 5 "“5 5555.. 55 :' 55:: 5155-55555 .‘5 . 5-. .. . 5.,«55. . 5-“. I I. 55.55"“. '3" . ,. . . (III IJIII. 9‘ ‘.7W" (5’5- : ‘ 5 u”. ' UIt'I ‘ .. 5"}.5 ,. '. - " ". MIC. ' '1 ' ' . ' 1.? '5 .Iifi'fi-quu'k' r '55'..'..‘5."""'5'."',"I“~I-.""'1"-"'I"95}' I§5¢-"..'55551«"‘ '55' .- 1%" ..I'-i.“¢'.<;~.'.'5;"~‘"2 9'? 53%“ a: . .~ :5 .'5'5 55 .. .:'.".I:II.' r554; : .I. .I.' .- .\ P" “ 5'. 4 - I.; ' ""5'"1"-~"“ "-"" I".:""fl':°."5" '1'3"'I' “IQ is" '.' ' : 3" W225! '52": 55" WEEK . ""5. m“ "1 , ' .W r I"-','\"\'."7.I '.‘] . 5' $5 . L. W” ". ;v1I-’.'--_I ’ u »- 1 . {gfflwflw III... I. 55"" 55"";3. 5'5 If. ""1" A35" Ihg'bu , . , .5555... 5...... 5 "" It"; IJII‘V 55%; 1'" "L I "Cipvo ' 5'5"" I"'"I?5[I"7."'"T,""~'IIEFII"C""3'11""):II v";- ”Ml 155M35354€55'9~555“'55“,“#-: I55 5 5, 5. I'm,‘ '.. .I f' 35:51.. ".55'13'J3I5 5:. I45" .. '5. 5 . _'1I"Jéél'."-a;:lf\" F2 :de 5"..." - . 5554’“ 5 5 . f I III. {5. 'ii'U'IHI". "3".“ ' '4'“. $ IIAI'I . ”515%“. ‘_" 5'} 5‘( if?” P“ (J:II I 5.;(J?;,%&{r:.;“‘\~ea '33:": 3”};‘5’1 [My 2‘! "1 Jgu‘l“ "5." '-5" 5 '."".'5:": \ "h" "I"~"'1"'1""Si' "I 2': "'.' "' ISA“ '.':P- ' ' ‘i'll't'flvfi' "‘itfie hfinzégw 'é'fi" flux -“-.\' $35K {WI ",55‘W. flu?“ '.'"; I ' I56; . ',1' """". '"75'5 " ... . . '..I5I""'\ "A" '. "".5‘ " '5. . ' . . "I “.55" (555.13” "-- " .'o" ""-'-'I"\' '.'" 'd'f’ '51] "'“byfluflffi'h '515I'§",~"""""'f“ 5. r5'.', "'.f"‘":""'1‘~‘>d'r¢" $65“ 7" . . I J'III'I'IV 55 .1" 'H5 "3 I ""33"" "I:,'I\.:VL.'L."‘: ['.I'I'. I355 -P ."I'I "5.55 "'{T-.'.;I;..'-..'_"}'.-'5 .. ."3'<7II"-..§5'3'1. 515'!” ,. ”5951'.'¢5.'r'9.5575 :5: II. .. 3-4.5; “5515315555153... '55,. 555' 55* : "1.55.55 " “ ' NW 5155;, I‘I: 5' '~ I ,. 515555.. 5; ( 5'. ‘ I ’III" ;'. :‘C' ' . '.'I'T‘. f‘g . '5'". . - a ‘I t :."x'\'5‘"':.n:‘"‘1'"\a"" 'd‘ “D .g " .5'." I 5 5"3 I"'aI:' ‘7‘; 1')va '.‘ 32"" 3"\ "'".""',"E:.!‘."'I; 1.:ng I'Mt' 1‘3, '. {Eek—f" .blé}: ,II'E‘N ‘.‘ 55,-55'W‘5e ”VF'QW" -. .'5 "'.' .ImLfr-."'" {WV . I .5 J! I ,5 . '1'“ on 5 l '5'" "'J+ "'11" '.I'I {-.. 9",. IL? {I ”9"“ V 1'5" ‘11! [5‘ "'"'J"I"’ .I“ '1’.I:’;)v ... KT: [’Urrwfi '5' .I .I.. I,. “2.3; .55- I. 555155553 In * '.I " .5, wgdbhfl' i‘d y‘j'bb. , '1'" "‘IJI' 31”,." '5"? '"'-'I'2':"' I'll INN ."."' 5' (V 13"“.er W” '.‘ '.' I o I" I" I 5"“ II III .:, I'I’Sf'w '.’.1""' ‘ 5...". 451:5“. "5I5I5’.5cu."."’""‘ ‘.'. """ """""""'I' ""3"" ML" 5.“ 8"". '29.! M‘II‘ 'I'..'I°.,"|'\E‘:“I.¥4l 3"“ ""g"" "'"I': 'I."‘,' 5I5 2'5“, '.':Ilt'd ‘ 1". "' " p" “"""'4\' 41' {'{J'V'JCI‘ WI) " ' ".‘f' '~.'.'.;-:"' ,IN“. 5:", 5'" I" ' | I ' ,' ' .. i" 5 55"I“'-'.:.~I5»'5..13.5555 I... .H 5’5"“; '.'. I!5I’-.‘-.~-.'T555:-.~.5555: ‘ 535 5.. 5.55.. I. . “5555' W 5-55" '. “I". J 5 ., 1 5 "II‘II ‘1. . '.I I . IIII'I'IIL'UL; [\an‘r'tv‘:’ J 1"rfl’t 115‘ '5". u. I'II:§HI‘ '.‘?“ ta ‘9: :5 ' . [It‘d/i‘le 0‘9": #1; I“ '.' ‘ v’b":1';\.(I ;":‘:" it?“ I :I"' ' "' ' " '.'-.3 .. (Ir V‘ "5 '5"555 ‘.' fox" “3" . '.'“ '5' "k"; 0' ‘9']. 3310‘ —PH'. J'y;v'I :z'I ".I J“; (E‘;' *0,“ .o‘u-o I I'I'I .' """'5 '."...r ‘.‘ I: I "."h'vw‘k." It. I”. "..|"I'f"‘""':. '. '.I .. ,."'Q$"'."f.\-' 51:15:: (35:5 - ”ix,“ "\KIIII." W". vv V "0-7: "'5 5"; “5‘“ I”, 5' @AW '.' ‘.‘K 'Rnu'm 5"}:J": '.IJ. 'II' 5"{I' ""71' . ' ":" '.'l‘ |' 'C " I"" I." i I I r ':' I. "s": ‘.'I"' '.‘ “:2; r'kl 1"ll . fi‘ IQII' :"I LI—fi' . I "'I '8 .55' ': "1"""" . "'. ?-""-"""" " . 
5'“ ”5"": ""' I'"""l "' r." '3 ll!" .. 'I. .I'5 'H H5 . I' 1 , . II . 1.. . "."": ”‘.'?" II'H'I‘II '.':iIl‘I v!l"'||.. 51"“ ""i'l"," VIA ll. .I5 1‘1"“.th 3" ' IS ' _ . . é— 5,55 -. 5 nw~5 .... ,5;.5.55555 -. 5 5 5‘, I ' ,II'JH 5 II}, If,“ I n . 5 ' u' 1).} " , 5' I‘.‘ "5 ‘ I; '2‘ .‘I n . ..‘, H. ., . 55' '5' ','. '. "1'." 5" . I‘I' ‘ ’5?" oI'VI" ~. 5' 'l-I’I' $35 5;. -. ' "I": .. '.'flt': ‘ '.' . 2'1""? . 9515':- _ ‘;-- ""' ' I h': II '.’ '5'5 ' .' . ' V "‘.‘ ‘J o‘ . 'cfl'n'r":"".' LI ' "' ' ' “I" ’ ' ‘ ' .‘I'I “”55' ('l 5 ' I Q I L ' ' y— .“W ‘ { (”I'Lf" :fil'd'I-‘Nv I H ' i > | . I J 5,5” I. "5' :'5 I, ‘ .. . . .5" '.'??? ‘5:. ’:"II Ii." "'. 'L-I A" . 5' 5 U 5, - 4< 0 o ' 5 ' :J.fr\ "3' .0 ’ ' ' ' . ' . ,\ ' \." .IILW. ‘.’} .I "' I‘QK'J. b . _ 55 555I~55 . 5 «I.;- ..I.5-5I-gw, .., ‘".5II._ 5" 5 I o. ' .5 '5 5 3'5 I "I (I. "' ' 5 55:41“, ' ‘ ‘ ' .' ':"’ 5 ' ,I' 1.“ ' n ' ’ ' f ' ’5 a "" 'o“' ' I 1" I ”if?" '.':b“ " I f" u 'l ' 1' : f "f ' I I "l' 5"" 5" I I . 5 I '.. 0‘: [35.25 |"'"-' A' ,I 5' 0 U I '.' . "'. IO 5 ."5""""' '.. " . 55' I t' ‘ : . ' ("' {3'53"}. 5‘ :z',. : .II . ' ‘ ' it i ' I. ' .5],1 ’. . . r . , 0 :. I I ' I'I 'II5 ::o:l}‘. 1h); :\ ;:-::|'. 55' "$. ' :5 ... '.‘. 5 '.'I . '55.? I. 5'. I. 3555'». . '.f. '. 515-55.” 35"} I"""" ‘ ' “3- " ' 5 5 ( II 95 '."(I '55. ' "'I I. ' z ". \ ' T'. '. : '5', 5 . "(‘5' '55. ' " "'.' ! pt" I: ” " 5 ‘ 5 .' '. 5 5 I: 5“.;:' 5. . .' '.'“ , I .. 5. .5‘ '.. ‘5 “5 IElla... I': L. . I- a '.' ’I‘II'I15":' .. . . , , . . .5 .5 » . . . ...-r . ..5 ~ I. .555 .5 55' I . ',. " .5 "II' b ‘5'." "' ll " I',' .' "'"E'.'.'" “Us. "" ". .""I' " " I'"*"I ‘, ' :1 . g 0 ""1'. ,I' " .IO ‘ “g . ' I. . .f I ‘5'." III. II I..." III» 1 f'I'J, l . . 5 J ' 05 " u "5'" ""':\ In“) 5" ‘n-l ' ' 5". '. .."'."". -"" ""5"," ’1" ~ "215'”- "'5 ' .5" . . o . "b ' ".v ' "' 5"",I: 0 db ”WI-f ' {'5 ~' " ’.' ’3'" '5' . '1' "ts-'3')" " i I '5 . ' ‘5 «'."5'5. . ’. 5: , .5 'I 5 .A' .5 J55" .i.I..'I"-‘...II' ' .5 .. ..' ‘5‘5 ' ' I. . I I'll 5'I:"I'I 5. ' 5 5 5... II‘ . :5 ' ;','.'"" "°".".'. '1 . 5 ".". 5, 'I ":35'5 I ."' 'I "II . . 55 . 5 I I ""1"" ""I 31'.” .5 5 :1. 5 " .5. 5:?5 .. 5' 5' " .1’5' {I =' I"‘l'~":*"' 1:515 a . I 'I' . I 'b ¢ ;.:'~-I. "' 5.555.. ,. '; .. T'- ."" ,1' ' '.'T'IU-Hw' . M55551 I'quII '.'.,,J"I' : ..- . 5"‘J'I. "'5' 5.5515155 5.. 5.5. . 'IIEIIé‘vI-Iw -~.5 ".'."';I5 'Ii'"I""""" 5". '3"'I"I'I'J','}" I'.""'§'5: 5‘: 'I. 555 ' I'W‘J IL" """I" 5555., '.-:5:‘:...'i:2=55:-.~"5I:I'Ll 5.1.5 51.. 15‘. '..‘ .5555»: 5155555559"W.'2'55IW 15¢»- TH ESIS fin"; aka-(mm. .‘.‘.......__."'" u" f I t. ' L I R R A R Y ' University . 4'31. as.) ‘ This is to certify that the thesis entitled TDMP: A DATA FLOW PROCESSOR presented by George Henry Simmons has been accepted towards fulfillment of the requirements for Ph. D. degreein Electrical Engineering ,flu 2W Major professor Date February 10, 1981 0-7639 ‘» OVERDUE FINES: 25¢ per day per 1m ' l::\\\\ k ‘ . ,(éum L RETURNIE LIBRARY MATERIALS. ” ' 1 Place in book mtum to move w charge from circulation records TDMP: A DATA FLOW PROCESSOR By George Henry Simmons A DISSERTATION Submitted to Michigan State University in partial fulfillment of the requirements for the degree of DOCTOR OF PHILOSOPHY Department of Electrical Engineering and Systems Science 1980 I but" \_;,/ ABSTRACT TDMP: A DATA FLOW PROCESSOR By George Henry Simmons This research investigates a multiple processor computer structure that has a higher bandwidth than comparable single-processor machines yet is more flexible than existing fixed-array multiprocessors. 
The basic approach taken here was to develop, model, simulate, and then analyze a computer structure called TDMP - an acronym for Time Division Multiple Processing. TDMP‘s interprocessor communication network is based on time-division multiplexing and time-division switching techniques and has the following general properties: greater bandwidth than comparable single-processor structures; less complex switching network than a crossbar switch interconnection network; more flexible than fixed-array networks; full-access and non-blocking interconnection capabilities among the processors; simply extended to pipeline operation; and, finally, amenable to VLSI circuit implementation. The justification for this research investigation is five-fold: First, single-processor systems have intrinsic bandwidth limitations known as the 'von Neumann bottleneck.“ This bottleneck is due to the word-at-a-time style of processing through a single communication channel. As a result, computer structures based on this machine design can not exploit the inherent parallelism in specific tasks. Second, while fixed-array-processor structures can exploit this parallelism. their range of usefulness is limited by their fixed interconnection structure. This results in performance degradation when the task structure doesn’t match the physical structure. Third, Dennis, et al., have designed computer structures based on data-flow principles, but they have not explored in detail alternative interconnection networks suitable for George Henry Simmons data-driven computation. Fourth, time-division communication techniques have been successfully applied in the telecommunications industry but have not been investigated for use in multiprocessor structures. Yet this technique appears at first glance to hold good promise under certain overall computer system constraints. This fourth justification leads directly to the fifth. Namely, with multiprocessor structures implemented on VLSI circuit chips, the interprocessor and inter-memory communication paths require a disproportionate amount of chip area when compared with that required by the processors and their memory. So the communication channels determine a significant portion of both the cost in chip area and chip speed. With all of these considerations in mind, the TDMP structure was defined, and APL computer simulation model was developed. This model formally defines the hardware structure, as well as the timing, switching delays, and communication protocols. The model also serves as the basis for analyzing the characteristics of the computer structure and for testing its usefulness in handling various multiprocessor tasks. We limited the model to a sixteen-processor structure. Results of this investigation show that TDMP indeed has a higher bandwidth than comparable single-processor machines. Sample computations show that the specific. TDMP architecture considered has fifteen and ten times the bandwidths of a single-processor system when computing eight-point FFI‘ and second-order digital filter results, respectively. However, in order to achieve these bandwidth improvements over single-processor systems, the granularity of the tasks performed in each of the processors must be large. Results of this research also show that TDMP has greater flexibility than fixed-array multiprocessor structures. This is due to the non-blocking, full-access interconnection capabilities in the TDMP structure, which allows the data paths to be reconfigured for new application programs. 
Simulations also reveal that time—division techniques can be exploited to route information packets in a data-flow George Henry Simmons structure with simple operating system constructs. Moreover, these time-division communication techniques do significantly reduce the V151 circuit chip area devoted to data paths in data-flow structures. ACKNOWLEDGMENTS Never has one individual owed so much gratitude and sincere appreciation to so many peOple for help making a dream come true. Most notable of the many people is Dr. P. David Fisher, my advisor and thesis committee chairman. I would like to personally thank you for your guidance and support in achieving this ambitious task. You are a gifted professional and your contributions should not go unrecognized. 1 would also like to thank my thesis committee members for their suggestions and guidance in my doctoral program: Dr. S. Crouch, Dr. J. Forsyth, Dr. H. Hughes, Dr. J. Kreer and Dr. R. Reynolds. Above all, I would like to thank my wife, Grayce, and my son, Gavin, for the support, tolerance and encouragement they gave me while achieving this task. Chapter II. III. IV. TABLE OF CONTENTS Page INTRODUCTION 1 BACKGROUND 4 2.1 von Neumann Bottleneck 4 2.2 Multiprocessing Survey 8 2.2.1 llliac IV 8 2.2.2 Cmmp 10 2.2.3 PM4 14 2.2.4 Indirect Binary N-Cubc 16 2.2.5 Star-100 21 2.2.6 Cray-l 23 2.2.7 Systolic Arrays 23 2.2.8 Other Commercial Multiprocessors 26 2.3 Observations 26 TIME-DIVISION MULTIPLE PROCESSING 31 3.1 TDMP Architecture 31 3.1.1 PE Array 31 3.1.2 Time-Division Switching Network 37 3.1.3 Control Unit 42 3.2 TDMP Implementation 43 3.3 TDMP: A Data Flow Processor 44 TIMING CONSIDERATIONS 51 4.1 Timing Diagrams 51 4.2 Maximum Multiplexing and Demultiplexing 53 Rates 4.3 TDS Maximum Switching Delay 55 TDMP SIMULATION MODEL 62 5.1 TDS Simulation Model 64 5.1.1 TDS Control Unit 69 5.2 PE Simulation Model 74 5.3 TDMP Simulation Model Operation 83 iii Chapter Page 5.3.1 PE Operation 83 5.3.2 TDS Operation 86 5.4 TDMP Simulation Model Summary 90 V1. TDMP PERFORMANCE EVALUATION 92 6.1 Performance Data 93 6.2 Comparison of Computational Performance 98 6.3 Discussion of Simulation 107 6.4 TDMP and Dennis Data Flow Models 109 V11. CONCLUSIONS 111 7.1 Summary 111 7.2 Further Research 115 REFERENCES l 17 iv Figure 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.10 2.11 3.1 3.2 3.3 LIST OF FIGURES Alternative implementation of the 2nd-order difference equation -- non-von Neumann style processor. A block diagram of the structure of ILLIAC IV. A block diagram of the data paths between processing units in ILLIAC IV. A block diagram of the CMU Multiminiprocessor (Cmmp) architecture. A block diagram of the PM4 architecture. A block diagram of the indirect binary N-Cube array architecture. A block diagram of an individual switch node in the indirect binary N-Cube array. A block diagram of the indirect binary N-Cube array control system. A block diagram of the Star-100 data paths. A functional block diagram of the Cray-1 architecture. The hex connected systolic array for matrix multiplication. Block diagram of TDMP architecture. Block diagram of 121:] PE array and control unit. Block diagram of a TDMP processing element. Page ll 13 15 18 19 20 22 24 25 32 33 35 Figure 3.4 3.5 3.6 3.7 3.8 3.9 3.10 4.1 4.2 4.3 4.4 5.1a 5.1b 5.2 5.3 5.4 5.5 5.6 Example of incoming and outgoing PE communication paths. Example of time slot interchanging in the TDS. Example of both space and time switching in the TDS. Block diagram of a single input/output time-division switching network. 
Data fiow program graph of the 2nd-order difference equation. Example of a data flow branch operation. Example of higher-level conditional expression evaluation. Timing and control signals for a 16 time slot TDMP system. Matrix representation of the P and Q control signals. Flowchart for TDS control algorithm. An example to illustrate the A N versus AzLogzN switching delay issue. Block diagram of the TDS APL simulation model. Part of the TDS APL simulation model. TDS simulation model memory map. Block diagram of TDS the control unit. Addressing modes for the TDS network C-move processor. Control unit logic network specification table. Examples of TDS processor move operations. vi Page 36 38 39 40 45 47 48 52 54 56 59 65 66 68 70 72 73 75 Figure Page 5.7a Block diagram of the PE APL simulation model. 76 5.7b Part of the PE APL simulation model. 77 5.8 PE states for data flow computing. 80 5.9 Logic truth table for the outgoing part of the CILU. 81 5.10 Logic truth table for the incoming part of the CILU. 82 5.11 Flowchart of the PE control algorithm. 84 5.12 PE control program code. 85 5.13 Flowchart for the TDS control algorithm. 87 5.14 TDS control program code. 88 6.1 Block diagram representation of a single PE TDMP model. 94 6.2 Timing diagram for a two time slot model. 96 6.3 Signal fiow graph of an 8-point FFT. 99 6.4 Bandwidth comparison as a function of the average number of FFT operations. 103 6.5 Bandwidth comparison as a function of PE execution time for the FFI‘ computation. 105 6.6 Bandwidth as a function of multiplication time for the digital filtering computation. 108 vii CHAPTER 1 INTRODUCTION In many scientific and engineering computer application areas, the computer's computational speed limits its range of usefulness. Examples of such areas include energy and power model- ing, weather modeling and forecasting, fluid dynamics studies, computer-assisted tomography, and artificial intelligence.1 Several parallel approaches are being taken to extend the useful range of computers. The first approach involves reducing the switching delay times of elemen- tary gates, which embraces both the areas of device physics and integrated circuit (IC) technol- ogy.2 With the second approach, the switching delays are assumed to be fixed but improved methods are sought for performing primitive single-operand and double-operand arithmetic operations, such as transcendental functions, integer arithmetic, and floating-point arithmetic.3 The third approach assumes that the primitive arithmetic operations are given; instead, the focus is on the design of computer architectures that overcome the intrinsic speed limitations of the von Neumann machine.4 The first approach is inherently limited in that the effects of reduced IC dimensions and the introduction of new logic families is expected to reduce gate delays and, consequently, improve overall computer system performance by only a couple of orders of magnitude.1’5 While this represents an important improvement, it does not in itself achieve the long-range improvements needed. Gains with the second approach will result largely due to decreased cost per bit and increased density per bit of semiconductor memories and logic arrays. 
Older algorithms for primitive arithmetic Operations will be modified to fully exploit speed improvements possible with ROM look-up tables or the advantages of performing concurrent conditional computations, such as conditional sum addition.3’6 While the third approach is ultimately limited by gate delays and the speed of primitive arithmetic algorithms, it provides the potential for making the largest increase in computational speed. Here, multipro- cessors are employed; and these multiprocessor architectures exploit the inherent parallelism in specific tasks, thereby eliminating the single processor bottleneck--'von Neumann bottleneck."4 We will restrict ourselves to this third approach. Moreover, we will restrict our attention to a class of multiprocessor architectures that can be implemented on a single very large scale integration (VLSI) chip or a small number of such interconnected chips. Historically, high-performance multiprocessor computer systems have been very expensive, special purpose, and generally research-oriented tools. V1.81 technology is changing this so as to make it feasible to build high-performance multiprocessor structures as low-cost, single-chip system components. VLSI technology is a statement about system complexity, not about transistor size or circuit performance. VLSI defines a technology capable of creating systems so 2 From this complicated that coping with the raw complexity overwhelms all other difliculties. definition, we can see that the way in which the computer industry designs multiprocessor com- puters in VLSI technology must, in fact, be different from the way it has traditionally designed computers in other technologies. For example, in VLSI technology the transistors will be almost 'free' and the interconnection data paths - communication - will determine the cost in 2 This is true because the interconnection paths in VLSI tech- both area and speed of the chip. nology are the same width as a transistor, which means that these paths reduce the available active chip area. So, for VLSI implementation of multiprocessor structures simple and regular underlying communication geometry is required to reduce the total amount of interconnection path lengths on the chip. In addition, the processors should be identical to reduce the layout time and effort of the architecture, the design should be partitionable into segments of manage- able size and these designs should have a wide range of use to justify the costs for a component manufacturer. As a result, this work is based on the following premise: Continued advances in integrated circuit fabrication technology will permit chip complexities to increase more than three orders of magnitude over what they were at the end of 1979.7 This research investigates an alternative single-chip microprocessor architecture that exploits this technology to improve significantly the computational capabilities and general usefulness of small computer-based sys- tems suited for signal processing computations such as waveform generation, modulation and filtering. 
Specifically, an alternative multiprocessor structure, called time Division Multiple Pro- cessing (TDMP), is investigated and has the following general properities: — greater bandwidth than comparable single-processor architectures; — less complex switching network than a crossbar switch interconnection network; — more flexible than fixed-array network; — f all-access and nonoblocking interconnection capabilities; — simply extended to pipeline operation; — highly compatible with data flow algorithms; — amenable to VLSI implementation. The TDMP structure is evaluated by comparing its performance to a single-processor archi- tecture as one boundary of performance and to a fixed-array-processor architecture as the other boundary of performance. Bandwidth, hardware complexity, flexibility, and regularity are the four principal figures of merit. Chapter 2 of this thesis contains a brief review of several key existing multiprocessor struc- tures, including their performance and range of usefulness. In Chapter 3, the organization and operation of TDMP is presented, along with an analysis of hardware complexity. Chapter 4 presents timing diagrams, closed-form expressions for the maximum multiplexing and demulti- plexing rates and the maximum switching delays. Chapter 5 describes the TDMP simulation model. In Chapter 6, the simulation results of two applications are given. The first is a fast-Fourier-transform and the second involves a digital filtering computation. We also com- pare the computational bandwidth among the TDMP system, single-processor system and fixed-array-processor system for these two applications. Finally, Chapter 7 gives a summary of this research as well as some suggestions for future research. CHAPTER 11 BACKGROUND 2.! von Neumann Bottleneck Single-processor computer systems have intrinsic speed limitations that have been ascribed to the 'von Neumann bottleneck." The term 'von Neumann bottleneck" was coined by Backus-I to represent the word-at-a-time style of processing that is characteristic of von Neu- mann machines--the model computer conceived by von Neumann and others about 35 years ago.7’8 The von Neumann computer is composed ideally of a central processing unit (CPU), a memory that contains data and instructions, and a connecting channel that can transmit a single word at a time between the CPU and the memory (and send an address to memory). The con- necting channel is where the von Neumann bottleneck occurs. The reason for its name is that all computational tasks in the von Neumann machine must be accomplished entirely by pump- ing single words back and forth through this connecting channel. A large part of the traflic in the bottleneck is not useful data but merely names of data, as well as operations and data used only to compute such names. As a result, single-processor bandwidth, defined as the maximum throughput measured in terms of the maximum number of results that can be generated per unit time, is always limited by the von Neumann bottleneck as the computational operations get sufliciently larger or complex. An example will serve to illustrate the issue. In digital filtering, a group of operations is performed once for each sample (in time) of the signal being processed. 
For purposes of this calculation, assume that the processor is a 16-bit microprocessor with a 200 nsec cycle time and the floating-point multiplication, division, addition and subtraction operations are performed in a hardware coprocessor.9’lo Single-precision, floating-point addition and subtraction are per- formed in 14 nsec and 18 nsec respectively, and double-precision extended multiply and divide operations are performed in 27 nsec and 39 nsec, respectively. If the second order recursive filter Y(2) _ A0+A.z"' ”(2) - Km 1 —a.:" — 15:22“2 (1) is implemented with this processor as a difl'erence equation of order 2 with constant coefficients Y(n)=AoX(n)+A,X(n—1)+B,Y(n-1)+BzY(n-2), (2) then the maximum filter bandwidth is limited to approximately 3.3 kHz. This bandwidth is fixed by the minimum time it takes the processor to perform the group of operations for each sample of the signal. The group of operations consists of one-at-a-time execution of four multiplication and three addition operations. For simplicity, we assume it takes zero time to move data and to fetch instructions from memory. In this case, the minimum execution time is, Tm,“ = 41” + 3:,4 - 4(27 nsec) + 3(14 nsec) = 150 nsec where t” and 1,, are the floating-point multiplication and addition times, respectively. There- fore, for every Tum, units of time a group of operations is completed, this corresponds to a pro- cessor bandwidth of (Twin)"' - 6.6 kHz and a maximum filter bandwidth of (213m)" - 3.3 kHz. If the order of the filter gets higher, the number of group operations incrcascs; consequently, the processor and maximum filter bandwidth decreases. The shrinking maximum filter bandwidth is caused by the von Neumann bottleneck and limits the processor’s usefulness in many filtering applications, such as those found in telecommunication systems, where the maximum filter bandwidth requirement is 4 kHz and the filter order is 5.11 The von Neumann bottleneck can be eliminated by exploiting the inherent parallelism in a group of operations. For example, if the group! of operations for the second order difference equation in our previous example is implemented as shown in Figure 2.1, the processor bandwidth is significantly improved. Three successive independent stages of computations are executed in parallel and the output of one stage feeds the next, analogous to an industrial assembly line. If we assume a continuous input stream of values, zero time to move data between stages and ignore any start-up times, the minimum execution time, TL“, - maxlt,,r2,13} - speed of the slowest stage in the system, where r. = r”, I; - IA and, t3 = (,4. Processor bandwidth is -tl- - 37 kHz because every Tim = r”, u a result can leave the system. This determines a maximum filter bandwidth of BW- 1. - l -18.5kHz. 2Tmin 2’” A direct comparison shows that 5.67:“,n - Tum; hence, the non-von Neumann style processor has a five-fold processor and maximum filter bandwidth improvement over the von Neumann style processor. This improvement was achieved by eliminating the one-operation-at-a-time style of processing through a single channel--the 'von Neumann bottleneck.” Recognizing that the von Neumann bottleneck limits processor bandwidth, we must develop processor systems that compute larger units of the task at hand. To accomplish this, multipro- cessing concepts are employed in new processor architectures to improve bandwidth. Two widely used forms of multiprocessing are parallel processing and pipeline processing. 
Parallel processing improves processor bandwidth by using many processors operating in paral- lel, either on difl'erent data sets or on difl'erent portions of the same data set.12’l3 Since, in fact, the processors are not always independent, they may require access to the same data or interchange of results between processorsm’15 Flexible interconnection networks are needed X(n) Y(n-1) Y(n-2) Stage 1 Stage 2 + + Stage 3 + ' '— Y(n-1 ) Y(n-2) #- DELAY DELAY V Y(n) Floating-Point multiply unit Floating Point add unit lLlH DELAY Register unit Fig. 2.1 Alternative implementation of the 2nd order difference equation -- non-von Neumann style processor. here to efficiently handle the communication problems. By flexible interconnection networks, we mean networks that can change their data flow pattern under program control. It turns out that most applications are sped up by transforming their traditional sequential algorithms to use the multiprocessor structure, not by executing on a processor with ever shorter memory access times. ‘6 Pipeline processing, on the other hand, improves processor bandwidth by subdividing the processing to be done into sequential functions and then assigning a processor to each function. These functions are then arranged in a 'pipeline" so that the output of one stage feeds the next.13 ’17’18 The processor bandwidth of this approach is limited by the speed of the slowest processor in the pipeline if fixed interconnection networks are used to feed the pipeline stages. By a fixed interconnection network, we mean a network whose data flow pattern cannot be altered. Jump presents quantitative techniques for the evaluation and comparison of digital sys- tem pipelines.l7 2.2 Multiprocessing Survey In this section, we briefly review several key multiprocessor structures in order to ideptify their multiprocessing attributes. We also suggest how well suited these attributes are for VLSI implementation for enhanced computation. Unless dictated by the need for under- standability, we have avoided material not directly related to the multiprocessing aspects of these systems. 2.2.1 llliac IV. llliac IV is a single-instruction stream multiple-data stream (SIMD) experi- mental computer designed at the University of Illinois in the late 1960‘s.19 The general struc- ture of llliac IV is shown in Figure 2.2. It contains 64 identical processing units (PU) with a common external control unit (CU), a four nearest-neighbor interconnection structure, an interface to a supervisory host computer and a switching network for interchanging data and instructions between the CU, host computer and memory. A PU comprises a processing I P-ih :- .-:L_Q" . -—— I , __ L" "1 ”'1 l4— . ‘ —i i t 'r---1 ‘--- -> PE F‘“ "—" r f .L_£_H—_'_"2_h—— .——. CONTROL ...4 mm . I . ‘ ‘ I SHITCH I l TI'TI‘ I ' t , supv. L... p563 —-'i p” ‘-~ conpurra ~ MEMORY , n—1 53 ___‘ Fig. 2.2 A block diagram of the structure of ILLIAC IV. 10 element (PE) and a processing memory (PM). A PE is a general-purpose arithmetic logic unit (ALU) capable of executing a conventional instruction set that includes 64-bit floating-point operations. Each PM has a capacity of 2 k, 64-bit words.”’19 The control unit fetches instruc- tions from a processing element’s memory, decodes them, and issues control pulses to the pro- cessing elements 'for execution. It broadcasts memory addresses and data words when they are common to all processors. An instruction can be either a control or a processing unit instruc- tion. 
The former directs operations local to the control unit, whereas the latter controls the execution of the processing units. The control is designed to overlap the executions of the two different instruction types.4 In addition to the common data and control buses that link the PUs to the CU, there are direct data paths connecting each PU to four neighboring PUs. Specifically, PU,- is connected to PU} if j -= i +1 (modulo 64), j - i- l (modulo 64), j - i + 8 (modulo 64) or j = i - 8 (modulo 64). The PUs form a two-dimensional array, Figure 2.3. For this reason, Illiac IV is often referred to as an array processor.4’l3'20 The array organization is very eflective in exploiting parallelism when the characteristics of the problem to be solved match the physical structure of the array."12 Matrix operations pro- vide an example of this kind of problem.12 The uniform processors and simple, regular com- munication paths satisfy VLSI implementation requirement.21 A disadvantage of the array organization is the inflexibility in the interconnection structure. This results in performance degradation when the problem structure does not match the physical structure of the array and reduces the universal appeal which brings about some high pressure constraints for VLSI imple- mentation.4’12’22 The failure of a single processing element can hamper the operation of the entire system; however, by adding alternate data paths in each processor, similar to a Chordal Ring interconnection network, the system would have a graceful degradation.23 2.2.2 Cramp. Cmmp is a multiple-instruction stream multiple-data stream (MIMD) 11 «- Wsa -- PU if ‘- pu ‘- G-l 55 -u- p”56 — P”57 p”58 .. p”63 PUo Fig. 2.3 A block diagram of the data paths between processing units in ILLIAC IV. 12 multiprocessor computer designed and built at Carnegie-Mellon University for computer archi- tecture and artificial intelligence research work.14 The hardware and software were designed with the goals of symmetry and general purpose in mind. By symmetry, we mean that repli- cated components, such as processors, are treated as an anonymous pool; no one of them is special in any sense. By general purpose, we mean the multiprocessor character of the machine is used to improve throughput across a set of independent jobs as well as to multiprocess single jobs.16 The system consists of 16 PDP—ll minicomputers connected to each other through an interprocessor bus and individually connected to 16 memory modules through a full-access, non-blocking, crossbar switching network. Each minicomputer operates like an independent processing system with its own primary memory and controllers. The interprocessor bus allows any processor to generate an interrupt to any subset of the processor configuration at any of several priority levels. No data is carried by this bus. The crossbar switch allows any processor to establish a path to any memory module--full access and all permutations of individual pro- cessors connected to individual memory modules are possible-~non-blocking. The switch is under both processor and manual control. Collectively, the 16 processors execute six-million instructions per second; the total memory bandwidth is about 500 million bits per second. Despite the fact that Cmmp is built from minicomputers, it is a large-scale machine. Figure 2.4 shows'the basic structure of Cmmp along with other system components. 
The other system components include another crossbar switching network to allow any processor to communicate with any of the various controllers which manage secondary memories, and input/output dev- ices. The basic design goals of Cmmp were achieved, and the research revealed some important observations about multiprocessor systems. Specifically, that the raw speed in the design of the switching network and the processor is not very important but reliability of the system is very important.16 The crossbar switching network provides full-access and non-blocking interconnec- tion capabilities for processor communication. Some criticisms of the system are: the hardware 13 16 x 16 Cross Bar Switch r--------------- ----- -1 l—l . ' Mg" 1‘\ .1'\ 1", ' ,_1__ . ‘l’ ‘l’ ‘1’ . I 2 I t ' l 39‘ 'sl Ax AK ' 14 I VI \1 \J l ..._..J I 1 ""'” I I ”9‘ rx {x 15 15 I‘\J ‘NJ‘ 1 I .___l 1 H, I HEM ' Xx [x rx ' 16 ' \J V1 ‘H ' L a - - - - - ............... J L Li. H .‘IEN EM p HAP 1A? F'_ ‘ F“ 1r 1 ICPI RC 43361) was] l;] Cross-Bar-Switch K Input/Output Controller Minicomputer (POP-ll) Memory Interrupt control PE 3 III. KC Fig. 2.4 A block diagram of the CMU multiminiprocessor (Cmmp) architecture. 14 is less reliable than desired, they were unable to partition Cmmp into disjointed systems, and there is not enough human engineering into the software interface to the user.16 The N2 growth of the crossbar networks make this architecture a poor candidate for VLSI implementa- tion. The crossbar connecting paths and switches use substantial amounts of the chip area, thereby reducing the size of a network that can be implemented on a given chip.24 2.2.3 PM4. PM4 is a reconfigurable multiprocessor system for pattern recognition and image processing research work and is currently under development at the Advanced Automa- tion Research Laboratory of Purdue University.15 The system envisioned consists of hundreds of Large Scale Integration (L81) bit-slice microprocessors and a three-level hierarchical memory connected by a set of interconnection networks.15’25 Figure 2.5 shows a basic block diagram of PM4. The system consists of N identical proces- sors with local memory (PMU), K identical vector control units with local memory (VCU), shared memory connected to the processors through a delta interconnection network (PMIN), file memory connected to processors by a shared bus and a yet undesigned interprocessor con- nection network which permutes data among processors (IPCN). The system can reconfigure its resources under system control to assume four different operation modes: 1. SIMD MODE - Single-Instruction Steam and Multiple-Data Stream (SIMD), Illiac IV.19 In this mode, the same instruction is executed by a subset of the processors operating on difl'erent data streams. This mode is used for vector operations with the vector control unit. 2. Multiple SIMD Mode - In this mode, a multiple number of SIMD operations are executed in parallel. 3. MIMD Mode - Multiple-Instruction Stream and Multiple-Data Stream (MIMD), Cmmp.M 15 L» ------J------ H Bus --------1 ------J Delta Network Processor-Memory Interconnection Network (PMIN) S ML" so Shared Memory SH L-l,H-l File Memory Control Unit File "Emory Fig. 2.5 A bIOck diagram of the PM4 architecture. 16 The processors perform independent tasks on separate data streams concurrently. 4. Distributive Mixed Mode - In this mode, SIMD vector instructions and parallel MIMD processes are simultaneously executed. The Vector Control Units (VCU) are used in the SIMD mode of operation. 
The VCU broadcasts instructions on the vector control bus to all processors that are assigned to the SIMD process. The VCU may also send control signals to a time-shared Interprocessor Communica- tions Network (IPCN) to switch the data in a group of processors. The IPCN is time-shared among VCUs during multiple SIMD processes. The VCU has the ability to mask or disable processors so that only the active or unmasked processors execute the broadcasted instructions. During, execution, a multiplexor is used to route broadcasted instructions from a VCU to a pro- cessor. A delta interconnection network is used to connect the processors to shared memory 26,27 for block transfer of information. PM4 architectural features are based on existing machines such as Cmmp and llliac IV.”’19 The architectural advantages of the above systems have been incorporated into PM4. The flexi- bility in the PM4 architecture allows it to overcome some of the architectural disadvantages in existing MIMD and SIMD machines. However, because of the processor speed requirements for swift reconfiguration and communication geometry (three different interconnection net- works), a single or couple chip V151 implementation of PM4 may not be applicable. Micropro- cessors such as Motorola 6800, 15111, Intel 8086, and 28000 do not meet PM4’s speed requirements and neither will V151 processors, since they are likely to be of only moderate speedw’28 V151 performance measures indicate that chip area requirements for delta networks are the same as those for crossbar networks.24 The difliculties of V151 implementation of crossbar networks were discussed in Section 2.2.2. 2.2.4 Indirect binary real». The indirect binary n-cube microprocessor array is a multipro- cessor architecture in which processors are interconnected by a switching network whose set of 17 connections can be described by the set of edges of the binary n-cube.28 It is called indirect because the array is not actually interconnected according to the topology of the binary n-cube. The basic form of the array is illustrated in Figure 2.6 for n - 4, N '- 2‘ = 16. The circles in Figure 2.6 represent the microprocessors, indexed from 0 to 15 as indicated by the numbers in the circles. The lines on the right from the switching network connect back to the microproces- sors with the indices given in parentheses. Each switch node, indicated by the squares in Fig- ure 2.6, has two input lines, two output lines and can be put into either of the two states shown in Figure 2.7, providing a “direct“ or a "crossed” connection. Each level of switching represents a dimension in the n-cube; in this case, n - 4. The network is used to permute data among microprocessors. In some cases, it may be necessary to make multiple passes through the network to obtain permutations of the data that are otherwise unrealimble. Multiple passes will, of course, entail a sacrifice of speed, but give added flexibility. In addition, as the n-cube dimensions get larger, the number of switching lev- els in the array increase, which also sacrifices speed in the system. The system has a two-level control system for the microprocessors, based on variable microprogramming stored within the microprocessors. This means that global commands sent by the main control unit to the microprocessors may be interpreted difl‘erently by each microprocessor. For the switch nodes a set of switch controllers are used to control a set of switch nodes. These switch controllers receive global commands from the main control unit. 
The control system is shown in block diagram form in Figure 2.8. This processing array can be used effectively for a broad range of SIMD applications. The regularity and modularity of its structure makes it an attractive candidate for V151 implementa- tion. However, the range of application of the array is limited by the structure of the binary n-cube. The binary n-cube does not have full-access and non-blocking capabilities.28 In addi- tion, the complexity of implementing indirect binary n-cube networks in V151 technology is the 18 \/ l [ Level 3 Level 4 SN 5n (0.21 10.3) 9:11 a r .le A U! o O A as O O ,L/m— :J/n a: as arl E? as a? >4 _ ><>< Fig. 2.6 A block diagram of the indirect binary n-cube array architecture. \ us b ‘—’ \J 19 81— —l—ao bi — — l_ ho (a) direct ai _—‘ I_ 30 bi -— ‘_ b0 (b) cross Fig. 2.7 A block diagram of an individual switch node in the indirect binary n-cube array. 20 l Controller l i j J SW3 tCh e o 0 Switch Contro1ler Controller ‘ — 0 0 Fig. 2.8 A block diagram of the Indirect binary n-cube array control system. 21 same as that for crossbar networks.24 2.2.5 STAR-100. The Control Data STAR-100 computer is a high-performance pipeline processor structured around a four-million byte, high-bandwidth memory.20’29 Instructions specify operations on variable length data streams. A block diagram of the STAR-100 memory-pipeline data paths is shown in Figure 2.9. The core memory and the data bus configuration have been designed to support a pipeline rate of 100 million 32-bit floating-point operations per second. The core memory has 32 interleaved banks, with each bank containing 2 k 512-bit words. This memory system can support 512 bits of data per minor cycle and there are 32 minor cycles. The width of the memory data bus for each group of four banks is 128 bits. The data bus transfer rate is 128 bits per minor cycle. Four buses are active with each bus transferring data at a rate of 128 bits per minor cycle. Two of the buses are used for transferring operand streams to the pipeline processor. The third bus is used for storing the resulting stream ele- ments and the fourth bus is shared between input/output storage segments and control vector references. The read and write buffers are used to synchronize the four active buses. The memory segments are bufl’ered to space them eight banks apart, eliminating memory conflict situations. The floating-point arithmetic section of the ST AR-100 consists of two independent pipeline processors. Processor] consists of a pipeline floating-point addition unit and a pipelined floating-point multiplication unit. Processor 2 consists of a pipelined floating-point addition unit, a non-pipelined floating-point divide unit and a pipelined multipurpose unit which is capa- ble of performing a floating-point multiplication, divide, or square root Operation. The memory interleaving and pipeline processing makes STAR-100 very efficient for pro- cessing vectors. Furthermore, its overall system design allows scalar processing as well. How- ever, with respect to flexibility, the architecture is non-flexible and applications must be 22 r' """" ‘1 Pipeline Processor 30 Memory Banks 128 Bits x 8 Banks 0-31 Containing 4M Bytes Stack and A.B.C.D ' 128-Bit Buses Fig. 2.9 A block diagram of the Star-100 data paths. 23 structured around a vector for most eflicient performance. 
The long start-up times are seen as a disadvantage and the communication network between the processors and memory banks--wires--is seen as a disadvantage of V151 implementationz‘20 2.2.6 Cray-I. Cray-1 is a very high-speed function-oriented general purpose computer built by Cray Research Incorporated and is capable of processing 80 million instructions per second.20’30 Both scalar and vector processing capabilities are incorporated into its design. A block diagram of the Cray-1 is shown in Figure 2.10. The main memory can be up to one mil- lion 64-bit words of 50 nsec cycle-time bipolar memory. The memory is 16-way interleaved so the CPU can easily achieve a data transfer bandwidth of one word per clock cycle. The 1/0 sys- tem consists exclusively of channel connections to other computers and channel connections to high-speed permanently mounted disks. Cray-1 is designed to extend the independent functional unit concepts developed in early CDC 6000 and 7000 series equipment.20 While the system avoids some setup problems found in the STAR-100, its architecture suffers from the fixed vector length. Vector chaining is used to circumvent the fixed vector length problem.30 However, the machine will not run efliciently if full task switching is done very frequently and special designs for each functional unit are not conducive to a single chip V151 implementation.2’3’o 2.2.7 Systolic arrays. Systolic arrays are special-purpose, high-performance multiprocessor devices. A systolic system is described by a network of processors which rhythmically compute and pass data through the system. Every processor regularly pumps data in and out, each time step performing some short computation, so that a regular flow of data is kept up in the net- work.2 1 Many basic matrix computations can be pipelined on systolic arrays composed of many inter- connected inner product step processors. Frgure 2.11 shows a hex connected systolic array for matrix multiplication. An inner product step processor is a processor that performs the 24. Program Control Section I’ Program Counter ’4] ’ I I I n I (4 Parallel /’ Memory and 1/0 Section Paths) InterrUP .lnétgugtignjuiffiri - - - - - ’ I_ _ Functional Units' Register Section H {/0 in. [_yector Length I ‘ ‘ . I Memory . l I 0 l K _ l , I, ‘nteoer Add : A . p ' A Registers Inte.rupts M 1 [_ integer u t ' Address Buffer Re isters L Pop Count fil—H __.g.. T ‘ ir— . Address . ' Registers . Real Time Clock ‘r— ' Integer Add I ‘ T ‘ I L. .Shifr : l I Locrcal ' S I [ Pop Count V1.4— r Buffé' Multiply Vector Ma5k Recaprocal ApprOximation Unit [ Integer Add 1_. soft I Lgoical ‘ Sfialg ' 1. e9 sters .1 ._.E12%§12§,599..._ if ca ar eating I I I hit- I I I I I I —-----i-- W E Vector Registers Fig. 2.10 A functional block diagram of the Cray-1 architecture. 25 Fig. 2.11 The hex connected systolic array for matrix multiplication. 26 computation, RC - RC + R4 1: R. and makes the input values for RA and R, together with the new value of RC available as outputs. Kung gives detailed examples of matrix computa- tions on systolic arrays.2| The systolic computations are characterized by the strong emphasis upon data movement, pipelining in particular. The arrays have simple and regular communication paths and almost all processors are identical. This makes systolic arrays attractive for V151 implementation. How- ever, the regular communication paths are obtained by sacrificing flexibility in the network. 
The rigid data flow paths are not reprogrammable and their structure depends on the computa- tion problem. As a result, a unique array must be provided for each difl’erent computation problem and, if the array does not have universal appeal, then the use of such a V151 part in a new architecture brings about some high pressure constraints.22 2.2.8 Other commercial multiprocessors. There are other multiprocessor systems besides the ones we have reviewed: IBM 370/168, CDC Cyber I70, Honeywell Series 60 level 66, Univac 1100 Model 80 and Burroughs B7700, to name a few. Each of these multiprocessing systems has distinct advantages and unique constraints with implications for performance. Some of these systems have architectural organizations similar to the previously reviewed sys- tems. The Cyber 170, for example, embodies the principle of functional partitioning similar to the Cray-l system. A survey which highlights some of the architectural strategies of the above commercial multiprocessor systems is found in reference 31. 2.3 Observations Our brief review of several key multiprocessing architectures provides us with a basis for making some general comments about multiprocessing systems. One central issue in their design relates to the networks over which the multiple processors communicate to other processors or memory modules. Clearly, as the number of processors increases, the charac- teristics of this communication network become critical to overall system performance, cost, 27 and reliability. For example, Cmmp uses crossbar switching networks for communication pur- poses. An N-by-N crossbar switch has full-access and non-blocking interconnection capabilities. However, the difficulty with crossbar is that network costs grow with N2. Given V151 perfor- mance bounds, crossbar networks are infeasible for single-chip multiprocessor systems. Pease’s indirect binary n-cube microprocessor array uses the binary n-cube switching net- work and Purdue’s PM4 uses a delta switching network. Both of these networks have similar complexity and cost. The number Of switch nodes in these systems grows with Mag; N; and, if implemented in small scale integration ($81) or medium-scale-integration (MSI) technologies, their network costs are cheaper than crossbar. But, if these networks are implemented in V151, their network costs are comparable to crossbar.24 Binary n-cube and delta networks are block- ing, which means under certain conditions messages going to difl'erent output ports will require use Of the same path between two switches. Since, under the assumed protocol, only one mes- sage can hold a given path during message transmission, blocking will occur. This blocking reduces the bandwidth and application range Of the networks and introduces added delays in the system Operation. Other multiprocessor systems such as Illiac IV and systolic arrays have even more restricted communications networks. Illiac IV uses a four-nearest-neighbor interconnec- tion and systolic arrays have fixed data path interconnections. This results in performance degradation when the problem structure does not match the physical system structure and the utility of such systems is limited by their specificity. The question thus arises, what type of interconnection network can be placed on a V151 multiprocessor chip that will enhance cOmputation and prove the multiprocessor chip useful in a large number Of applications? 
In Chapter 3, Time Division Multiple Processing (TDMP), we suggest time division switching networks as an alternative solution to this problem.32 Time-division switching networks have full-access and non-blocking interconnection capabilities like crossbar networks, use very few data paths, and the network costs are dominated by the 28 cost-per-bit of semiconductor memory. However, no single network is generally considered 'best' since the cost effectiveness of a particular design varies with such factors as the computa- tional tasks for which it will be used, the desired speed of interprocessor data transfers, the actual hardware implementation of the network, the number of processors in the system, and the cost constraints on the construction.33 The processors used in a multiprocessor system may or may not be identical. The Cmmp, Illiac IV and systolic arrays are examples of systems with identical processors. The Cray-1 (in which the independent functional unit concepts are employed) is a system with difl'erent proces- sors. It is essential that the processors used in a V151 multiprocessing system be identical modules organized in a simple, regular fashion with a minimum number of connecting paths between modules because such geometry leads to high density and, more importantly, to modu- lar design. The TDMP system adapts to these V151 requirements by using identical processor-memory modules connected to time-division multiplexed buses. Another issue in the design of multiprocessor systems is their performance. lnvariably, the scaling in performance of a multiprocessing system is sublinear. That is, using an N-processor multiprocessing system always yields less than N times the performance of the corresponding single-processor based systems. This is true for both N-unit parallel and N-unit pipeline archi- tectural organization of multiprocessor systems. One the one hand, we have multiprocessor architectures, like Cmmp, that are very flexible and have a wide range of application. However, this flexibility is at the expense of additional software and hardware operating system overhead needed for communication and control. On the other hand, we have multiprocessor architectures, like systolic arrays, that have very high performance but limited range of application. The very high performance architectures have low operating system overhead, but this low overhead is at the expense of fixed interconnected data paths between processing elements to enhance the computation for a specific problem. SO, 29 in our investigation we seek a balance between these two extremes with a multiprocessor archi- tecture that has performance and flexibility bounded by a general purpose architecture like Cmmp, on one hand, and a non-flexible special purpose architecture, like systolic array, on the other. We also seek an architecture that allows a simple software interface to the user. Con- ventional architectures speed up most applications by executing algorithms that have been transformed to use the multiprocessor structure or by fixing the transformed algorithm into the interconnection wiring of the multiple processors. Both approaches have proved successful in many ways; however, in every case permitting large amounts of parallel activity, it has proven far more difficult to obtain parallelism in software than to provide it in hardware. 
In view of the nature of parallel hardware systems and the practical difficulties of keeping them running at full speed, we investigate an architecture that executes data flow programs. Data flow programs are seen as a natural reinterpretation of conventional programs with parallel execution in mind. This approach abandons the classical instruction driven computing. The underlying problem with most current attempts to use parallel hardware is that they are based on traditional con- cepts of programming.36 These concepts in turn are based wholly on the serial von Neumann computer design, with instructions executed one at a time. In particular, use of a program counter remains obligatory. In instruction pipelines, no attempt is made to alter the basic von Neumann model. In vector and array processors, one instruction may operate on many pieces of data, but only one instruction executes at a time. In multiprocessors. many program counters step thrOugh subprograms simultaneously presenting complex problems of communi- cations such as memory conflict. The use of a program counter is inappropriate when programs are intended for parallel execution. Efforts to develop a model of computation which can efl'ectively express parallelism have yielded a form of program representation known as data flow. Execution of a data flow program is data-driven; that is, each instruction is enabled for execution just when each required operand has been supplied by the execution of a predecessor instruction. Dennis and Misunas, Gurd and Watson have done work on the design of 30 computers based on the data flow concepts.“’39 The system we investigate is a new, simpler implementation based on some of their work. CHAPTER III TIME-DIVISION MULTIPLE PROCESSING In this chapter, we present the Time-Division Multiple Processing (TDMP) architecture and examine its hardware complexity. It is called 'time-division' because the transmission and switching of information among the multiple processors is done with time-division techniques; i.e., time-division multiplexing and time-division switching. These techniques are useful in meeting the interconnection wiring and pin constraints imposed by V151 design while enhanc- ing overall arithmetic computation. The key to TDMP is the integrated transmission and switching system that provides the communication channels among the processors. After describing TDMP, we very briefly discuss implementing it using V151 technology. The intent here is to show that TDMP is very compatible with V151 implementation. Then we consider TDMP as a data flow processor and investigate how it executes programs expressed in data flow notation. 3.1 TDMP Architecture The basic TDMP organization is illustrated in Figure 3.1. The components consist of an N x M array of processing elements (PE), time-division switching networks (TDS), and a control unit (CU). We will give a brief description of each component and its interrelationship with the others. 3.1.] PE array. The basic structure of the N x M PE array is shown in Figure 3.2 for N - 12, M - l. N represents the number of PEs that share a unique pair of two-way incom- ing and outgoing time-division multiplexed (TDM) buses. The number in each PE box gives the PE time slot position on these buses. M represents the number of unique pairs of incom- ing and outgoing TDM buses in the array. 
The incoming buses are shown as two-way com- munication lines to the left of individual PEs and the outgoing buses are shown as two-way communication lines to the right of individual PEs. The remaining lines are control lines that 31 32 ‘1 Br Incoming Outgoing Receive Port Receive Port CONTROL UNIT Outgoing Incoming Transmit Port Transmit Port Input Input Outgoing Incoming TDS TDS Output Output A0 B0 Incoming Outgoing Receive Port Receive Port N x M PROCESSING ELEMENT ARRAY Outgoing Incoming Transmit Port Transmit Port Fig. 3.1. Block diagram of the TDMP architecture. 33 .uwcs Houucoo cam xmuum mm aqu mo Emuwmam xoon N.m .wwm man also? 9.2.85 4 25 3623.3 wad-50cm % mm III. NH m age: Houucoo e c mac uulocmuu wcuowuao 34 convert the PEs into time-division multiplexed circuits for transmitting and receiving informa- tion packets consisting of flag bits, switching addresses, destination addresses, operand tags, operands and PE signaling information. All PEs operate independently, are identical (see Fig- ure 3.3) and consist of a central processing unit (CPU), memory and a communications inter- face logic unit (CILU). The CPU has arithmetic and logic facilities to execute instructions stored in memory. However, these instructions are not enabled for execution until all data operands have been received from predecessor PE operations. The communications interface logic unit (CILU) interfaces the CPU to the incoming and outgoing transmit and receive TDM buses, Figure 3.3. The incoming transmit and receive TDM buses are used for communication purposes in receiving packets from other PEs. The outgoing transmit and receive TDM buses are used for communication purposes in transmitting packets to other PEs. Both communication arrangements are illustrated in Figure 3.4. Incom- ing packets contain data needed for the next computation and outgoing packets contain results of a previous computation. Each CPU independently coordinates its use of these buses through its CILU. The CILU consists of an addressable information packet buffer connected to each TDM bus and control logic with control inputs from the CPU and timing signal inputs from the control unit. The CPU can read from bufl'ers connected to receive TDM buses and can write into buffers connected to transmit TDM buses. The timing signals from the control unit are used to convert these bufl'ers into time-division multiplexed circuits. As a time-division multi- plexed circuit, each bufl'er is assigned a time slot in a frame that is regularly repeated. The con- tents of the transmit buffers are multiplexed onto its respective transmit TDM bus during its time slot and the contents on the receive TDM bus are demultiplexed into its respective receive buffer during its time slot. The time slots given to all buffers for a particular PE are the same. Since we have separate outgoing and incoming communication paths, PEs can simultaneously transmit and receive information packets during the same time slot. 35 UT on IT IR from P CU Q Flags I M cpu CILU ‘—— Control *r--'---4I~ r-—~ H I Data Bus 0T - Outgoing transmit bus OR - Outgoing receive bus IT - Incoming transmit bus IR - Incoming receive bus CPU - Central processing unit M - HeHIOry CILU - Conmunications interface logic unit Fig. 3.3 Block diagram of a TDMP processing element. 36 Nu.— .msumq mcowumowaaaaoo mm wafiowuao can wcaaooca mo OHemem q.m ouswwm mace; _ x mason .3; mecca mecca . a: ecu: e: no a: see: a co cam . _ 388.. «38 . .8 . . _ 38.5. .33 — e. 38 37 3.1.2 Tine-division switching network. 
The time-division switching (TDS) networks are used to permute the information packets among the PEs, Figure 3.1. There are two switching networks in TDMP, an outgoing TDS and an incoming TDS. The outgoing TDS has inputs from outgoing transmit TDM buses and has outputs connected to incoming receive TDM buses. The incoming TDS has inputs from the incoming transmit TDM buses and has outputs connected to outgoing receive TDM buses. Both TDS networks are identical and operate in exactly the same manner. To switch packets between PBS, the packets from individual time slots on one bus are placed in the same or difl‘erent time slots on other buses. The interchang- ing of time slots is essential to time-division switching.32 The operation of time slot interchang- ing is illustrated in Figure 3.5 for a single time-division multiplexed bus with four time slots. In this example, the input time slots 0 and 2 are interchanged; i.e., the contents on the input TDM bus during time slot 0 are placed on the output TDM bus during time slot 2 and vice versa. To switch information packets between buses, space-division switches are used to inter- connect (permute) TDM buses for each time slot period. This is known as “time multiplex switching."32 Figure 3.6 shows a TDS result in which both space and time switching occurs. Time slot 0 of TDM bus 0 is interchanged with time slot 2 of TDM bus 1 and vice versa. Figure 3.7 shows a block diagram of a single input/output TDS network. The architecture of the switch is extremely simple, but surprisingly powerful and flexible. The switch consists of addressable input and output registers, dual central processing units (CPUA and CPU,). Each of these CPUs has its own program memory (PM, and PMs), read/write modules, and multiplexors/demultiplexors. The processors (called network processors) operate concurrently and perform all the switching functions according to a stored program in each PM. All data in read/write memory (RAM) and I/O registers except the input address register are words in a common data memory (DM) and the only operation of each processor is to move a word from 34 some location in DM to another location in DM. A separate DM is associated with each pro- cessor. A move operation takes exactly two memory cycles, the FROM address is read from 38 rsa rsz r31 rsol Time-§lot lrss rsz rsr rso ( 1 I 1 fi 7,— I_(Time Switch) T51 - Time Slot 1 NOTE: PEO +1Paz (1-1,2,3,4) PEZ +~PEO Fig. 3.5 Example of time slot interchanging in the TDS. 39 153 152 rsr rso rss 752 151 750 J PE3 , 9:2 1 PEl 1 PEG T,fl. D,v,s,°n 9:3 1 PE? 1 PE1 1 PE6 l ‘T "" '1 ' Sutton 1 I I Bus 0 (Time ‘ space Bus 0 rss 752 rsr rso S'I‘C“) rss 152 151 150 _1 P27J P56 1 was J 954 9:7 I PEOJPES 1 PE4 ‘ n I m 1 —I I Bus 1 Bus 1 Note: PEO - 9E6 pas - pro Fig. 3.6 Example of both space and time switching in the TDS. 4O .xuosuoc weazoufizw ceamfi>fivtoawu unnu=o\u=ncfi mawcwm m mo Emhmmfiv xOOHm n.m .mwm scum—us: a E SE .- L OIR UL ‘ < n:¢ eacace< . _ < ammo: zen n—unu cut: scene: at: <2D A goo—a ale...— x.l.e: one»... 5.: _ .. ebeam-.ep sub-c =3: Oa_em accuse a. I I .4 . . can can: uaeunc u:___e:u.m mccaaac nth um—h use “any: ui..=e u:_k 0:0 mEm—cflx tits usee_ « ear I I. la - nu _ . . xenon death < e.e.m-_eh . E 0-! use: . . ¢e___-euun . . vantage . nah n<3m 11.11 nmcuv—i car can: canes a... eu uses: ¢¢~hcw mxuhmuuns hnnlu Itasca: .eo—uneuuaon . . uancu . 
so...» 41 PM and is used to read at that address from DM, then a T0 address is read from the next loca- tion in PM or from the input destination address register, A, which is the address where the word is written in DM. In the TDS network, this move operation corresponds to putting the input information packet data register contents into the CPU hold register and performing time slot interchanging on this data by storing the data packet in a memory location in DM that represents the same or diflerent time slot position. When the input destination address register is used in the TO cycle of the move operation, the switching is called dynamic and the proces- sor move instruction executed is called 'indirect move.“ When the PM address data is used in the TO cycle of the move operation, the switching is called static and the processor move instruction executed is called 'direct move.“ The operations and capabilities of the individual network processors of this type architecture are developed by Lipovski and called the C-move processor.34 However, the network processor differs slightly from the C-move processor in that it allows an addressing mode for which the address in TO cycle of the move operation is taken directly from a special address register and not PM. This change enhances the switching speed of the TDS network. Each input packet is serviced sequentially according to its time slot posi- tion. The processor completes its service cycle for each packet during one time slot. After one frame, all data packets have been stored into DM. During the next frame, all of the stored data packets are moved out sequentially from RAM locations in DM to the output register loca- tion in DM. Since each processor must first store all packets during one frame then read out all packets in the next frame, two CPUs are needed so that when one CPU is inputting packets the other CPU is outputting packets. CPUA uses DM,. for its input move operations; however. the input registers are only associated with DM,. when CPU,. is inputting packets, otherwise these registers are associated with DM, which is used by CPU, in its input move operations. Simi- larly, the output registers are only associated with CPU,. and DM,. when CPU A is outputting packets, otherwise they are associated with CPU, and DM, when CPU, is outputting packets. A multiplexor and a demultiplexor are used to connect the input and output registers to a DM 42 by using framing information to make the correct association. We have two frames called A and B. During frame A the input registers appear as memory locations for CPU, in DM, and the output registers appear as memory locations for CPU, in DM,. During frame B the output registers appear as memory locations for CPU, in DM, and the input registers appear as memory locations for CPU, in DM,. The execution of the processor program in both CPUs is in synchronization with the input and output TDM information packet buses. The input is synchronized by using the input flag bit as a predicate for program execution, using the input switching address as the program counter for PM, and using time slot sizes greater than or equal to the time it takes the network processor to execute the instructions pointed to by this switching address. The output is syn- chronized by using the time slot counter value as the program counter where the instructions pointed to in PM sequentially output stored packets and flag bits. 3.1.3 Control unit. 
The control unit (CU) interfaces with 1/0 or a host processor, initializes the time-division switching networks, initializes the PEs, generates and sends timing pulses to PEs. The initialization steps are done prior to the beginning of program execution. Initializa- tion of the time-division switching network consists of storing in program memory the program switching instructions associated with control and time slot interchanging and resetting the net- work processor to begin execution. The initialization of PEs consists of resetting the CPU and storing instructions, data constants, and initial values into each PE memory. The control unit has direct access to the network processor’s program memories for program loading and pro- gram modification purposes. However, CU access to the PE memories is through the switching network. The CU has outgoing transmit and receive TDM buses and incoming transmit and receive TDM buses. These buses are connected to the TDS networks in the same fashion as the PEs, Figure 3.1. However, the CU has a multiple number of time slots per frame for com- munication purposes as compared to the PEs which have one time slot per frame. The CU can 43 use these time slots to send and receive information packets to and from individual PEs and to send control packets to the TDS network processor. The control packets modify the network processor program counter, which causes a jump to a subroutine that services that input control message. 3.2 TDMP Implementation There are several important attributes of TDMP that make it well-suited for VLSI implementation. The first attribute is that the majority of the processors are identical. This produces a major benefit in decreasing the layout time and effort of the architecture. The decrease comes about because the use of identical structures reduces the total number of dev- ices which must be individually drawn. In addition, the more structured layouts are easier to validate. A second attribute is that the processors and memory are in close proximity and can be implemented in the same technology. The locality of processing reduces the hardware and software communication overhead. A third attribute is that the underlying communication geometry, time-division multiplexed buses, is simple and regular which leads to high density and, more importantly, to modular design. In this way large TDMP systems can be developed as a collection of many simpler TDMP systems. A fourth attribute is that the switching net- work is regular and has full-access and non-blocking capabilities with simple and efficient con- trol. The regularity in the switch comes about through the use of semiconductor memory for switching the data. This use is compatible with V151 and future technologies, since these tech- nologies will decrease both cost per bit and memory access time. The flexibility in the architec- ture brings about a universal appeal. As a result, V151 cost advantages can be obtained by the wide variety of applications in which it can be used efl'ectively. However, this is not to say that TDMP is suitable for all applications. The last attribute is that overall architecture of TDMP is highly compatible with algorithms that use a packet form of data movement such as data flow.35 In data flow algorithms, there is no need to maintain long and continuous connections 44 between source and destination such as in block transfers of information. 
So in TDMP, processing elements send information packets to other processing elements only during designated time slot periods.

3.3 TDMP: A Data Flow Processor

TDMP executes programs expressed in data flow notation. These programs are normally described as program graphs which represent the data dependencies between operations. The attractiveness of such a system lies in the fact that it is data-driven; that is, each instruction is enabled for execution just when the required operand(s) has been supplied by the execution of a predecessor instruction(s). Since data flow instructions have no side effects, unrelated instructions can be executed concurrently without interference if each has its required operands. In this sense, the progress of a computation is determined by the passage of data through the system. Principal advantages of the TDMP data flow over conventional designs are reduced complexity of the processor interconnection network, greater use of pipelining, and a simpler representation and implementation of concurrent activity.

To illustrate the basic concepts of TDMP data flow operation, consider the data flow program shown in Figure 3.8. This program represents the computation required for a second order recursive digital filter

Y(n) = AX(n) + BX(n-1) + CY(n-1) + DY(n-2)

where X(n) and Y(n) denote input and output samples for time nT, where T = 1. In this diagram, PE operators 2, 3, 4 and 5 are single-input operators that multiply by the fixed parameters A, B, C and D; PE operators 6, 7 and 8 are two-input operators that perform addition; and PE operator 9 is an identity operator that transmits its input values unchanged. Each small solid dot is a link that receives results from an operator and distributes them to other operators for use as operands. Input operator 1 represents the outgoing transmit control unit port through which an external stream of values that represent the input signal X(n) is presented to the program.

Fig. 3.8 Data flow program graph for a 2nd order difference equation.

Similarly, output operator 10 represents the incoming receive control unit port at which the sequence of values representing Y(n) is delivered during program execution. The large solid dots (tokens) show the presence of values at certain input arcs of operators and define the initial configuration for program execution. An operator with tokens on each of its input arcs and no tokens on its output arcs is enabled, and may fire by removing the tokens from its input arcs, computing a result using the values associated with the tokens, and associating the result with a token placed on the output arc of the operator. A link is enabled when a token is present on its input arc and no token is present on any of its output arcs. It fires by placing tokens on each of its output arcs and removing the token from its input arc. The new tokens distribute copies of the value associated with the input token over each output arc of the link.

TDMP can perform conditional execution by using PEs to perform the primitive branch operation. The branch operator selects one of two output arcs (destination addresses) on which to place (send) its first input data, according to the state of a second Boolean input value. The two possible firing states lead to the execution sequences shown in Figure 3.9. The operator can be used to achieve conditional expression evaluation at a higher level.
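Before turning to conditional evaluation, the operator and link firing rules just described can be made concrete with a short executable sketch. The Python below is ours and purely illustrative; the graph encoding and function names are not the TDMP instruction format. It fires any node whose input arcs all hold tokens and whose output arcs are empty, which is exactly the enabling condition stated above.

    # Minimal token-driven firing sketch (illustrative only; the node/graph
    # representation is our own, not the TDMP packet or instruction format).

    def run_dataflow(nodes, tokens, max_steps=100):
        """nodes:  {name: (function, [input arcs], [output arcs])}
           tokens: {arc: value} -- arcs currently holding a token
        """
        for _ in range(max_steps):
            fired = False
            for name, (fn, ins, outs) in nodes.items():
                enabled = all(a in tokens for a in ins) and not any(a in tokens for a in outs)
                if enabled:
                    result = fn(*[tokens.pop(a) for a in ins])   # consume input tokens
                    for a in outs:                               # copy result onto each output arc
                        tokens[a] = result
                    fired = True
            if not fired:
                break
        return tokens

    # Example: y = a*x + b, written as two operators sharing the arc "ax".
    nodes = {
        "mul": (lambda a, x: a * x, ["a", "x"], ["ax"]),
        "add": (lambda p, b: p + b, ["ax", "b"], ["y"]),
    }
    print(run_dataflow(nodes, {"a": 2.0, "x": 3.0, "b": 1.0}))   # {'y': 7.0}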
Figure 3.10 shows a natural translation of the high level conditional assignment:

ABS := IF A ≥ 0 THEN A ELSE -A.

Conditional flow graphs should be constructed with caution, since the absence of tokens flowing down some arcs might leave other tokens stranded at inputs to nodes. Conditional expressions, such as the one in Figure 3.10, are 'safe' provided that both THEN and ELSE expressions are stated and are of the appropriate type. Conditional evaluation combined with cyclic or reentrant flow graphs makes TDMP extremely powerful. For example, an iterative or loop construct can be implemented by conditionally deciding whether to send tokens to the next block in a program, or to recycle them through the current block.

Fig. 3.9 Example of a data flow branch operation.
Fig. 3.10 Example of higher level conditional expression evaluation.

However, problems can arise in using some reentrant flow graphs because of blocking and token identification. Gurd and Watson discuss these problems and cite several examples to illustrate the issue.36 Dennis first proposed a very basic version of a data flow language in which instruction execution was limited only by the data dependencies of the program. Dennis and Misunas did preliminary work on the design of a computer based on this language.35 Rumbaugh has expanded and improved the earlier version of the data flow language proposed by Dennis and has developed a multiprocessor architecture consisting of N identical activation processors.38 Each processor is capable of executing in a pipeline manner several data flow instructions at a time.

The TDMP data flow processor is conceived as using each processing element as a combined operator and link, or just a link. The time-division switching networks in TDMP are then used as a means by which the link operations can be carried out. If a particular data item is to be used concurrently in more than one place or time in the system, then that data item must be explicitly sent to the multiple processing elements where it will be used. By using submultiplexing in both the PEs and switch, A frame and B frame, two items can be sent to different destinations. Submultiplexing is an important part of TDMP when transmitting to multiple locations in order to prevent two PEs from simultaneously transmitting to the same PE during the same frame.

Data items are transmitted in the form of an information packet. A PE information packet contains six fields: flag, switching address, destination address, tag, operand and signalling. The switching address and destination address are used by the switching network to direct the tag, operand, and signalling fields of each packet to the correct destination. The tag field is used by the PE to identify what operator in PE memory the data operand is associated with. By storing multiple operators in memory, task or program switching is performed more quickly. The number of multiple operators is limited by the PE memory size. The signalling field is used by the PE to transmit and receive request and acknowledge signals when exchanging information packets between PEs. Once placed on a data path, packets remain there until destroyed as a consequence of being acknowledged by the destination PE. The packet is placed into a firing set of the PE using the tag field. Any PE can execute any machine language program provided that it has a sufficient amount of storage.
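The firing-set mechanism just mentioned, and elaborated in the next paragraph, can be sketched in a few lines. The Python below is ours and purely illustrative: the tag of an incoming packet selects a stored operator, the operand is stored if the operator is not yet firable, and the PE fires immediately once the operand set is complete. The class and function names are assumptions for the example.

    # Sketch of a PE's packet handling and firing set (illustration only).
    from dataclasses import dataclass, field

    @dataclass
    class Operator:
        needed: int                       # number of operands required to fire
        fn: callable                      # operation performed when the PE fires
        received: list = field(default_factory=list)

    def receive_packet(operators, tag, operand):
        """Store the operand under its tag; fire immediately if the set is complete."""
        op = operators[tag]
        op.received.append(operand)
        if len(op.received) == op.needed:             # PE becomes firable
            result, op.received = op.fn(*op.received), []
            return result                             # would be formatted into outgoing packets
        return None                                   # not yet firable: operand is stored

    # Example: the operator stored under tag 0 is a two-input add.
    ops = {0: Operator(needed=2, fn=lambda a, b: a + b)}
    print(receive_packet(ops, tag=0, operand=4))      # None (waiting for second operand)
    print(receive_packet(ops, tag=0, operand=5))      # 9    (fires)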
The machine language program defines the operation performed by the PE; e.g., arithmetic or logical type operations. If the incoming packet data does not make the receiving PE firable, then the packet data is stored in the appropriate place in the PE memory. This situation occurs when the received packet does not completely satisfy the needed operand requirements of the PE. For example, if the PE needs two Operands to be firable and has previously received none, then the incoming packet will be stored until the second operand has been received. If the incoming packet data makes the PE firable then the PE fires immediately, using the packet data as needed. When the PE produces a result, it for- mats an outgoing information packet for each copy of the result that is needed. CHAPTER 1V TIMING CONSIDERATIONS In Chapter 3 the PEs and control unit were described as time-division multiplexed circuits when transmitting and receiving information packets. The timing relationships of the control signals necessary to convert the PEs and control unit into time-division multiplexed cir- cuits were not discussed in Chapter 3. In Section 4.1 of this Chapter, we describe these timing relationships in detail with a timing diagram. In Section 4.2, we develop an expression that bounds the maximum multiplexing and demultiplexing rates achievable in TDMP. The max- imum multiplexing and demultiplexing rates fix the minimum time slot size on the time-division multiplexed buses. The minimum time slot size in turn determines the logic one and logic zero time duration requirements of the timing control signals discussed in Section 4.1. The TDMP switching network has an inherent delay to aflect switching of the information packets. In Section 4.3, we present an expression that bounds the maximum switching delay through the time-division switching (TDS) network in TDMP. 4.] Timing Diagrams The timing diagram of the control signals that convert the control unit and each PE into a time-division multiplexed circuit is shown in Figure 4.1. These control signals consist of P, Q, TWD, RWD, A and B frame. In this diagram 16 time slots are created (0-15) by using four Q signals (Q0, Q4, Q, and Q”) and four P signals (Po, P,, P, and P3). Each PE has one Q signal and one P signal connection. The control unit has one P signal and four Q signal con- nections. The simultaneous occurrence of a logic one on both the Q and P signals with logic one on TWD for transmitting and logic one on RWD for receiving select the control unit or a PE for multiplexing and demultiplexing information packets. TWD and RWD are clock signals used to multiplex and demultiplex the information packets to and from the time-division 51 52 .Eouwmm mzna uon mafia 0H m pom mamcwwm Houucoo can mafiawa ago .mHh 1FTCC EC1EFTCC1—LI JL1FJIJITCCFTEEEF71C|F 11411:. .11.. E FI1._ TIL 71L r1L E _.11_1 1.. E FIIL F1 _ w h — 3.3.:Ea:1.1.1eEa11%;.ages; It. "1 53 multiplexed buses. The 4x4 matrix shown in Figure 4.2 is an alternative representation of the P and Q signals used for creating each time slot position. The numbers in each box represent the time slot position created. In addition the time slot positionfs) for the control unit and each PE can easily be determined by simply adding together the subscripts on the P and Q sig- nals for that PE or control unit. For example, P3 and Q4 generate time slot 7, 3 + 4 -= 7. In TDMP time slots 0, 4, 8 and 12 are given to the control unit and the remaining time slots are used by the PEs. 
The length of time for which both Q and P are simultaneously logic one sets the time slot size. This time slot size is a function of the maximum multiplexing and demultiplexing rates and will be discussed in Section 4.2. The A and B frame control signals are used for submultiplexing purposes. Separate registers are used to hold A and B frame information packets. When A frame is logic one only A frame information packets are multiplexed and demultiplexed, and when B frame is logic one only B frame information packets are multiplexed and demultiplexed. However, A and B frame cannot simultaneously be logic one. More levels of submultiplexing can be employed in TDMP; however, increasing the amount of submultiplexing also increases the transmission delay of individual packets within the system.

4.2 Maximum Multiplexing and Demultiplexing Rates

To determine the maximum TDMP multiplexing and demultiplexing rates we must start at the time-division switching (TDS) network. The TDS network works on a sampling basis. The transmitted information packets on the input of the TDS are sampled then stored in random access memory (RAM). The time between information packet samples is called the "sampling period" of the switch, T_s. The minimum sampling period is bounded below by the read/write cycle time, t_rw, of the RAM plus the control overhead, t_c, for processing the information packet:

T_s ≥ t_rw + t_c

The control overhead consists of the time for scanning the input for new samples, testing the frame condition, determining the type of service requested and reading the destination address. A flow chart for the TDS control algorithm is shown in Figure 4.3.

        Q0   Q4   Q8   Q12
  P0     0    4    8    12
  P1     1    5    9    13
  P2     2    6   10    14
  P3     3    7   11    15

Fig. 4.2 Matrix representation of the P and Q control signals. Note: The numbers in each box represent the time slot position created from the P and Q control signals.

The minimum read/write cycle time, t_rw, is fixed by the current state of the art of semiconductor memories, and as the technology improves t_rw will decrease. The control overhead, t_c, will also decrease as technology improves; however, further improvements can be obtained by maintaining precise synchronization of the TDS with the input multiplexed buses, thereby eliminating the scanning of flag and frame bits used for synchronization. In addition, if we provide for only one type of switching service, then we can eliminate the control time needed to determine the type of switching service requested. In this investigation we will take the conservative path of keeping the extra control overhead, since our primary purpose is to determine the feasibility of using time-division techniques to enhance computation, and further enhancements can be made by the elimination of the extra control overhead. As a result, the switch sampling period puts a lower bound on the minimum PE time slot size,

T_x ≥ T_s

where T_x is the time slot size and T_s is the switch sampling period. The minimum time slot size fixes the maximum multiplexing and demultiplexing rates in the system,

f_max ≤ 1/T_s(min) = 1/(t_rw + t_c)

In our control algorithm for the time-division switching network used in TDMP, t_c = 8·t_rw (Figure 4.3). Therefore in TDMP the maximum multiplexing rate is

f_max ≤ 1/(9·t_rw)
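As a quick numeric check of the bound just derived, the following sketch (modern Python, added by us purely for illustration; variable names are ours) evaluates f_max for the 30 ns RAM read/write cycle time assumed in Section 4.3, and confirms the 270 ns time slot that value implies.

    # Numeric check of the multiplexing-rate bound f_max <= 1/(t_rw + t_c).
    # The 30 ns cycle time is the value assumed later in the text.

    t_rw = 30e-9             # RAM read/write cycle time (assumed 30 ns)
    t_c = 8 * t_rw           # control overhead of the TDMP switching algorithm
    t_slot = t_rw + t_c      # minimum time slot size T_x
    f_max = 1.0 / t_slot

    print(f"T_x   = {t_slot * 1e9:.0f} ns")      # 270 ns
    print(f"f_max = {f_max / 1e6:.2f} MHz")      # about 3.70 MHz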
4.3 TDS Maximum Switching Delay

The maximum switching delay of the TDS in TDMP is a function of the time slot size, the number of time slots per frame, and the process of time slot interchanging. In TDMP 16 time slots make up one frame. Each PE is assigned one time slot and the control unit is assigned four time slots. These frames are regularly repeated and alternated via submultiplexing between A and B frames.

Fig. 4.3 Flowchart for the TDS control algorithm.

As described earlier, in Chapter 3, the TDS network switches information packets by providing for the association of input time slots with output time slots. All input information packets are processed sequentially and stored into RAM, then read out sequentially from RAM. This process of reordering the sequential input information packets by storing them into RAM locations specified by destination addresses and then reading out the RAM contents sequentially is called time slot interchanging. This technique implies a time delay to effect a change in time slot position. The delay is the amount of time it takes the TDS to store all the input information packets for one frame into RAM locations representing time slot positions plus the time it takes the TDS to sequentially output all these stored information packets. As a result, the maximum switching delay can be predicted by the following expression,

T_D = 2·N·T_x

where N is the number of time slots per frame, T_x is the time slot size, and the 2 is a result of the time slot interchanging process. In TDMP N = 16, and if we assume that the read/write cycle time of RAM is 30 ns then T_x = 270 ns and the maximum switching delay in the TDS equals

T_D = (2)(16)(270 ns) = 8.64 μs

The TDS switching delay results in a maximum packet switching delay of 17.28 μs. The packet switching delay is twice the TDS switching delay because of the A and B frame submultiplexing, in which packets are transmitted every other frame. However, if the same information packets are sent during both A and B frames, then the maximum packet switching delay equals 8.64 μs. If the switching control overhead is reduced to zero, the time slot size decreases to T_x = 30 ns, and the maximum switching delay becomes T_D = 0.96 μs.

The maximum switching delay in the TDS network increases linearly with N,

T_D = A_1·N

where N is the number of time slots on the TDS input and output time-division multiplexed buses and A_1 is a weighting coefficient of the TDS network. The coefficient A_1 is determined from the switching implementation, control algorithm, and any other factors that might influence the switching speed of the TDS network. The other factors might include fault detection built into the hardware and diagnostic programs in software to improve reliability of the network. Many of the popular multistage interconnection networks such as the binary n-cube, banyan and shuffle exchange have a switching delay which increases logarithmically with N,

T_D = A_2·log_2 N

where N in these networks represents the number of input and output ports of the network and A_2 is its weighting coefficient.40 The coefficient A_2 is determined from the same factors that are used to determine A_1 in the TDS network. In comparing the switching delay of interconnection networks the rate of increase, N or log_2 N, is often used as a measure of performance. This is a useful measure when N is very large. However, for practical systems built with technology we can use in the immediate future, we must not only consider the rate of increase but also the weighting coefficients that multiply the rate of increase.
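The delay expression and the coefficient comparison can be illustrated numerically. The sketch below (Python, ours) evaluates T_D = 2·N·T_x for the numbers quoted above; the A_2 value in the second part is hypothetical, invented only to show a crossover of the kind argued for next, and should not be read as a measured figure for any real multistage network.

    # Illustrative evaluation of T_D = 2*N*T_x and of the A1*N versus
    # A2*log2(N) comparison.  N and t_rw follow the text; A2 is hypothetical.
    import math

    def tds_delay(n_slots: int, t_rw: float, overhead_cycles: int = 8) -> float:
        """T_D = 2 * N * T_x, with T_x = (1 + overhead_cycles) * t_rw, in seconds."""
        return 2 * n_slots * (1 + overhead_cycles) * t_rw

    t_rw = 30e-9
    print(tds_delay(16, t_rw) * 1e6)                      # 8.64 us, with control overhead
    print(tds_delay(16, t_rw, overhead_cycles=0) * 1e6)   # 0.96 us, overhead removed

    A1 = 2 * 9 * t_rw     # per-slot coefficient implied by the expression above (0.54 us)
    A2 = 10e-6            # hypothetical per-stage coefficient for a log2(N) network
    for n in (16, 64, 256):
        print(n, A1 * n < A2 * math.log2(n))              # True while the TDS is faster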
If the weighting coefficients are not used in determining the maximum switching delay, then the conclusions drawn from such an analysis might be wrong. As an example, if we compare the maximum switching delay of the TDS and the popular interconnection networks based only on the rate of increase, then we would conclude that the popular networks have a maximum switching delay less than that of the TDS network for all N ≥ 1, since log_2 N < N for all N ≥ 1. However, in Figure 4.4 we show that the maximum switching delay of the TDS is less than that of the popular networks for some values of N ≥ 1 if A_1 < A_2.

Fig. 4.4 An example to illustrate the A_1·N versus A_2·log_2 N switching delay issue.

This example points out that the issue in comparing maximum switching delay is not just the rate of increase, N versus log_2 N, but is A_1·N versus A_2·log_2 N. It has not been shown but is conceivable that A_1 << A_2, since the popular type interconnection networks, referred to as log_2 N networks, may have to make multiple passes through the network to reach the final destination and messages may have to wait as a result of blocking in the network. These factors influence the A_2 coefficient but not A_1. However, the applicability or superiority of one interconnection network over another should not be based only on switching delay. For example, unless efficient algorithms which use these networks and simple operating systems that manage their resources and supervise their processes are found, the discussion of switching delay is moot because the system as a whole will not be cost-effective and will experience considerable amounts of down-time due to software and hardware failures, thus reducing the system throughput.

We feel that the TDS network offers some significant advantages over the log_2 N interconnection networks in algorithm mapping and operating system implementation. The mapping advantages result from the non-blocking and full-access interconnection capabilities of the network. Since all permutations are possible in the TDS, the programmer can construct or restructure algorithms without concern about the interconnection capabilities of the network. And because the network is a C-move processor, many of the operating system functions can be directly implemented in the network, eliminating much of the time and hardware needed in supervising the switching process and in communicating and acting on exceptional conditions arising during switching.

The TDS switching delay is tolerable in the TDMP system if we consider that while some PEs are transmitting and receiving information packets through the TDS network, other PEs are performing computations. As a result, throughput, i.e., the quantity of useful information processed by the system per unit time, is a better measure of performance for TDMP than is the switching delay of the TDS network in TDMP. And since TDMP is based on data flow principles and the TDS network has full-access and non-blocking interconnection capabilities, the decomposition of programs, that is, programming sufficient parallel activity into software to keep the parallel hardware fully occupied, is an easier task to perform.

It was shown in this chapter that each PE needs only two signals, P and Q, to convert it into a time-division multiplexed circuit, and that the minimum time slot size generated by these signals is determined by the maximum TDMP multiplexing and demultiplexing rates.
The multi- plexing and demultiplexing rates were found to be dependent upon the sampling period of the TDS network. We also showed that the TDS network has an inherent switching delay that increases linearly with the number of time slots, N. We presented arguments as to why the rate of increase of switching delay in TDS and the popular logzN interconnection networks was not sufficient to determine the superority of the logzN networks over the TDS network. In the next chapter, we describe a simulation model of the TDMP architecture. CHAPTER V TDMP SIMULATION MODEL A simulation approach was chosen to analyze TDMP because it provides a timely and economical means of evaluating preliminary designs. Moreover, the versatility of this approach means that a wide range of alternative designs can be realistically analyzed. The use of a simulation model in our investigation has a two-fold purpose. The first is to analyze TDMP’s system performance, e.g., the switching capabilities and switching delay, the communi- cation protocols between processing elements, and the ability to execute a single data flow pro- gram at a time. The second purpose is to provide a detailed language description of the TDMP hardware that complements the less detailed block diagrams describing this hardware. As a result, the simulation language in our investigation has the following characteristics: — capability to simulate concurrent activity; — capability to simulate timing: — flexibility to simulate hardware at both the gate-level and functional-level; — easily translates hardware representation into a computer program. Several simulation languages were available to us to choose from for our simulation model. These languages were SPICE, ECAP, GPSS, and APL. SPICE and ECAP are circuit level simulation languages and are extremely useful in modeling the dc, ac and transient eflects in circuits.41 However, these languages are too low-level for our current simulation purposes because they model hardware at the transistor-level. These languages would be more useful to us when we have verified the correct operation of the system at the logic gate-level and are ready to test the design performance at the transistor-level for integrated circuit implementa- tion. 62 63 GPSS is a simulation programming language used to build computer models for discrete- event simulations.42’43 GPSS is a process-oriented language and is particularly suited to the translation from a flow-chart representation of a system into a computer program. This is a disadvantage for our simulation purposes because the computer program representation that results is too high-level. We have a hardware representation of our system and would like the computer program model to complement the less detailed block diagrams describing the hardware as well as to simulate the processes in the system. GPSS would be more useful in simulating the protocols in TDMP and not the hardware design. APL, however, meets most of our desired simulation language needs.44‘45’46 APL has the ability to perform both functional and gate-level simulation and the model implemented in this language neatly complements the less detailed block diagrams describing the hardware. The ability to perform both functional and gate-level simulations gives us the diversity to synthesize some parts of the model at a high-level using functional modules and some parts at a low-level using gates. 
Functional modules in the TDMP model tend to be those parts of the architecture that are not germain to the TDMP operation, e.g., arithmetic logic units, registers, memory storage units, etc. The gate-level models tend to be those parts of the TDMP architecture that are unique to its operation, e.g, control units, interface logic units, multiplexors, demultiplex- OI’S, etc. All of these simulation languages have short coming when they must simulate concurrent processing. These languages are based on sequential computation with a single processor, as a result they cannot at all or with limited success model asynchronous events or concurrent operations. In addition all of these languages are very costly and require fast, high capacity computers. Given the advantages and disadvantages of each simulation language we chose APL as the language in which to build our simulation model. This chapter begins with a description of the APL model of the time-division switching net- 64 work, Section 5.1. Then in Section 5.2, the APL simulation model of the processing element is described. And, finally, in Section 5.3, the operation of the complete APL simulation model of TDMP is discussed. 5.1 TD? Simulation Model The block diagram of the data paths for the time-division switching (TDS) network simulation model is shown in Figure 5.1. There are two identical sets of data paths in the TDS network, one for switching A frame information packets and one for switching B frame infor- mation packets. As a result, only one set of these data paths are described in presenting the TDS network simulation model. The bused data paths in the model consist of DBUS, OBUS, IBUS, INBUS and OUTBUS. DBUS is the network processor data bus. All data moved to or from memory, registers and hardware operational units is placed on DBUS. OBUS and IBUS are internal use buses of the network processor. These buses connect the inputs and outputs of the network processor internal use registers to DBUS. INBUS and OUTBUS respectively con- nect the switch input and output port registers to DBUS. Many of the remaining data paths consist of gated input and output connections to these buses from registers, memories, and hardware operational units such as an adder, decrementer and incrementer. By gated connec- tion we mean that a component must be selected in order to put data on a bus or take data ofl‘ a bus. The gates are represented by the small circles on the data paths shown in Figure 5.1. Specifically, INBUS has gated connections to the input and output of the input flag (INF) register, the outputs of the switching address (SWAD) register and input data (DIN) register. The INF register signals a request for service to the switch during an input packet operation. While the SWAD and DIN registers respectively contain the address of the type of service requested and the input packet data to be switched. The OUTBUS has gated connections to the input and output of the output flag (OUTF) register, the input of the data out (DOUT) register and the output of the time slot counter (TSC) register. The OUTF register signals the switch ti:- '— .Haeoe .5323: 5.2 m8 2: do seamen. 383 «an .3... ...g 3 .8 .88 67 for service during an output packet operation. While DOUT register contains the switched out- put data packet and TSC provides addressing used by TDS for demultiplexing the appropriate output data packet to DOUT. It is important to note that INBUS and OUTBUS are bidirection- ally gate connected to the processor data bus (DBUS). 
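The notion of a "gated connection" used throughout the model description can be expressed directly: a register drives or samples a bus only while its gate signal is asserted. The short sketch below (Python, written by us for illustration; class and signal names are abbreviations of ours, not the APL model's identifiers) shows one way such gating could be modelled at the functional level.

    # Functional-level sketch of a gated bus connection: a register drives the
    # bus only while its output gate is asserted, and loads from the bus only
    # while its input gate is asserted.

    class Bus:
        def __init__(self):
            self.value = None

    class GatedRegister:
        def __init__(self, name: str):
            self.name = name
            self.value = 0

        def drive(self, bus: Bus, gate: bool) -> None:
            if gate:                    # selected: put register contents on the bus
                bus.value = self.value

        def load(self, bus: Bus, gate: bool) -> None:
            if gate:                    # selected: capture the bus contents
                self.value = bus.value

    # Example: gate the DIN input register onto DBUS and capture it in HOLD.
    dbus = Bus()
    din, hold = GatedRegister("DIN"), GatedRegister("HOLD")
    din.value = 42
    din.drive(dbus, gate=True)          # input-register gate asserted
    hold.load(dbus, gate=True)          # HOLD gated off DBUS
    print(hold.value)                   # 42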
The DBUS is bidirectionally gate connected to a non-addressable HOLD register, data memory (DM) module, and addressable internal use registers (IR). The HOLD register tem- porarily stores data during processor move operations. While DM is used primarily for writing in and reading out data packets via the HOLD register during the time slot interchanging pro- cess. However, some parts of DM are also used as a scratch pad in conjunction with condi- tional move operations. The IR registers are the program counter (PC), the jump to subroutine (JSR) and index (X). The PC register contains the address of the next move instruction in the processor program memory. The JSR register stores the return address in jump to subroutine operations and the X register is used for index addressing, and up-down counting. The remain- ing IR registers, S and C, are additional index registers. However, it was found that they were not needed in this simulation so they are not fully implemented for index addressing. Other gated DBUS connections consist of outputs from the decrementer and incrementer Operators and the inputs of the N flag bit register, decrementer and adder operators. The N flag bit is used by the TDS control unit to test for conditional moves. The N flag bit in conjunction with conditional moves is described in the TDS control unit simulation model in Section 5.1.1. Some of the remaining data paths consist of connections from the memory address register (MAR) to the memory address decoders of the input, output, data memory and internal-use registers. These decoders are not explicitly shown in Figure 5.1. However, these decoders enable gates if the MAR value is in the address space defined for that decoder. In this simula- tion the internal registers are given memory address locations 0-7, data memory has locations 8-63, the input registers have 64-71 and the output registers have 72-79, Figure 5.2. Gate 68 80 Output 72 Input JH__ Data Ila-cry 8 Internal Use 0 Figure 5.2 TDS simulation model memory map. 69 signals GIR, GDM, GIN, and GOUT respectively gate the internal registers, data memory, input registers, and output registers data onto and from the DBUS. The remaining decoder sig- nals select specific registers within these groups. The MAR has gated input data path connections from the destination address register (A), program memory bufl‘er register (AR) and the output of the adder (ADD). Gating A into the MAR is a result of an indirect move, gating AR into MAR is a result of a direct move and gat- ing ADD into MAR is a result of an indexed move. The few remaining data paths consist of gated connections from the HOLD register to the incrementer input and program counter register connections to the incrementer input and out- put for incrementing the PC. And finally, we have the data path gated connections from the program counter to the program memory address register (PMAR) and bidirectional program memory data bus (PMDATA) with gated connections from the program memory (PM) to the program memory bufl'er registers (AM and AR). The program memory contains the TDS switching program consisting of only move instructions. A move instruction consists of an addressing mode and address. The AM register holds the addressing mode value in the From and TO cycles of a move instruction. In the FROM cycle these addressing modes are immedi- ate, direct and indexed. In the TO cycle, these modes are direct, indexed, conditional and indirect. 
The AR register holds the address value in the FROM and TO cycles or in the case of immediate move the value of the data to be moved. As a result, the AR register has a gated connection to the DBUS for sending data to the HOLD register. Both AM and AR are inputs to the TDS control unit which performs the address evaluation and generates the timing and control signals necessary for all move operations. 5.1.1 TDS Control Unit Figure 5.3 shows a block diagram of the TDS control unit. The control unit consists of a combinational logic network and memory. The inputs to the control unit are: the least 70 .HMRCD n —# .___... Bartram. y ..___. so CODINA‘I'IGIAI. s :D were AR___. crooner 81 a nerve». :D- o 1 cu sun: aacrsrra Figure 5.3 Block diagram of the TDS control unit. 71 significant bit of the program memory address register (PMAR(5)), the addressing mode bits (AM), the N flag bit, the decoder outputs (8,, S ,, SS) and the control unit state register out- put, (2,, 2,, 22). The control unit outputs consist of control signals (Y), and the control sig- nals Do, 0., 0,. The Y control signals gate in both space and time data paths in the TDS net- work. The D control signals generate the next state of the control unit. The least significant bit of the program memory address register, PMAR(5), determines the cycle in the move Operation. If this bit is logic 1 the move cycle is identified as FROM and if this bit is logic 0 the move cycle is identified as T0. The AM bits are interpreted by the control unit so that the appropriate gate signals are generated for the type of addressing that is required. For example, in an indexed move the control unif must also generate the control signals that gate inputs to the adder from DBUS and the output from the adder to the MAR as well as the normal read/write memory control signals. In Figure 5.4, Tablel shows the addressing modes as a function of the AM bits and move cycle. The N bit is used as a test signal in conditional move operations and is the most significant bit of each word moved. This bit is gated into the N bit register off the DBUS during the TO cycle of every move operation. In a conditional move, if the N bit is logic 1 the moved data is written into a memory location and if the N bit is logic 0 the moved data is not written into memory. In both cases the FROM cycle of the move opera- tion is performed, however in the TO cycle the N bit condition is tested. The S0, S I and S 5 inputs to the control unit identify internal use registers that are involved with move operations such as increment, decrement and jump to subroutine. The network specification table shown in Figure 5.5 gives a detailed logic description of the properties of the control network. It shows for all the inputs to the control unit all the space and time outputs the control unit produces. For example, if the control unit is in state 0, Z, -Z 1-21-0, the contents of the program counter, PC, is gated into the program memory address register, PMAR. The clock advances the control unit to state 1, 2,-21-0, Z;-=l. During state 1 the mode and address contents of the program memory are placed in the AM From To 72 PMAR(S) AM AM AM AN AM 000 001 010 011 100 0 Immediate Direct Index Unused Unused x l Unused Direct Index Conditional Indirect x Fig. 5.4 Addressing modes for the TDS network Cdmove processor. 73 Fig. 5.5 Control unit logic network specification table. 74 and AR registers and based on the cycle and addressing mode the move operation is executed. 
If we assume that the addressing mode is direct (AM o-AM,-0, AM 2-1) for both cycles and we are in the FROM cycle, PMAR(5)-l. The next clock pulse advances the control unit to state 2 (20-0, 21-1, 22-0) where it gates the FROM address in the AR register into the MAR register. The next clock pulse advances the control unit to state 3 (lo-0,21-Zz=l) where it gates the contents from the address memory location specified by the MAR to the HOLD register and also gates the data paths necessary to increment the program counter. The next clock pulse advances the control unit back to state 0, where the next three clock pulses cause the next program memory word to be fetched and the MAR to be loaded. However, when the clock advances the control unit to state 3 the cycle will be TO, PMAR(5)-0, so the control unit gates the contents from the HOLD register to the memory address location specified by the MAR and increments the program counter. Figure 5.6 gives examples of all move operations possible with the simulated TDS network C-move processor. In these exam- ples assume initially that memory address location 15 has the value 20 and memory address location 17 has the value 30. 5 .2 PE Simulation Model The block diagram of the data paths for the PE simulation model is shown in Fig- ure 5.7. The data paths consist of DBUS, registers, communication interface logic unit (CILU), arithmetic logic unit (ALU), program memory, data memory and a C-move processor. DBUS is the PE data bus. All data moved to or from data memory, registers, CILU and ALU is placed on DBUS. The registers are partitioned into groups called an outgoing communication port and an incoming communication port. These ports are identical but are used for difi‘erent purposes. The outgoing port transmits request signals and data and receives acknowledge sig- nals. The incoming communication port receives request signals and data and transmits ack- nowledge signals. Since both ports have the same hardware for communication, only the 75 Inn-diate nave lSdll-7l after nve (151-7 Direct nove 151-317 after mite (lS)-30 INDEXED nove (sl-S 15 G-lZIx) after nove (15)-30 lO(x)d-l7 after nove (15)-30 S - indirect l - innndiate (x)- index t - conditional Fig. 5.6 Examples of TDS Conditional nave N-l 15¢ rlF- 17 after nove (15)-30 I'D 15¢ «II- 17 Ifter nove (151.20 INDIRECT aove (A1315 SA-nI-' 17 after nove (15)-30 Initially: HAR(15 ' 20 HAR(l7 ' 3O HAR(X) I 5 processor move operations. 76 .Hovoe cofiumasaam <9..:::. :89. 9232 3". ..I... .P .25.... qm< mm ago we amuwmwv xoon an.m .waa q - . . . . . . o . . . . . . . . . . r ‘------------------J 77 .Hovoe acqumaaawm na< as use we been eh.m .waa 78 outgoing port will be described. The registers that make up the transmitting section of the outgoing communication port are TINFA, TINFB, TSWADA, TSWADB, DINA, DINB, TAA, and TAB. All of these registers are gate connected to DBUS on their input side and gate connected by multiplexor gates to their respective time division multiplex bus on their output side, Figure 5.7. TINFA and TINFB are respectively the A and B frame transmit flag bit registers. These registers signal requests for service to the switch during the A and B frame PE time slots. The request for ser- vice is basically an enable signal that informs the switch if it should perform or not perform a switching operation on its current input data. Since each PE has its own unique time-slot to seize the switch there is no chance of collision between PEs requesting for switching service. 
This helps to improve the throughput of the system. TSWADA and TSWADB are the A and B frame switch address registers. These registers are initialized by the PE with an address value of a location in the switch program memory. This location in the program memory contains move instructions that perform the switching operations on the switch input data. In our inves- tigation the switch operations will be limited to moving the input data directly and indirectly to the switch data memory module. Direct and indirect move operations were described earlier in Chapter 3. DINA and DINB are respectively the A and B frame input data registers. The input data registers contain the data that is to be switched. Each of these registers is composed of three registers with separate gated inputs and a common gated output. The registers are labeled TRES, TAG, and TSIG with A and B subscripts added to each name to signify their association with either A or B frame. The TRES register contains the result of a previous PE computation. The TAG register contains a value used to identify which operation this result is to be used with in the next step of computation. The use of a TAG is required when the PEs are pro- grammed to perform multiple operations or operations in which data values are reentered into the data flow program. In this simulation each PE is programmed for a single operation and data flow programs that use feedback are not simulated. As a result, the TAG bit is shown but 79 is not actually used in this simulation. TSIG is the signalling bit register used to transmit the idle and request PE signalling states. Transmit registers TAA and TAB are respectively the A and B frame data packet destination address registers. These registers contain an address representing the same or diflerent time slot position of the PE. The switch uses this address to carry out the time slot interchanging operation on the input packet data. The receiving section of the outgoing port consist of OUTFA, OUTFB, DOUTA and DOUTB. OUTFA and OUTFB are respectively the A and B frame receive flag bit registers. These registers are sensed by the PE to determine if new receive data is available. For the out- going port the receive data consists of tag and PE signalling. For the incoming port the receive data consists of operands, tag and PE signalling. The receive data is contained in the DOUTA and DOUTB registers, respectively, for A and B frame data. The receive data registers have a single gated input, whereas the output has a separate gated connection to DBUS for each field in the register, Figure 5.7. The CILU data paths generate the outgoing and incoming PE transmit signalling bits as well as the PE execution enabled signal. The inputs to the CILU consist of the outgoing and incom- ing receive signalling hits, the ALU status bit and the current state of the PE. The receive sig- nalling bits were discussed earlier in this section, the ALU status bit, PST, is either in the idle or busy condition, the PE state information is stored in a register and new state values are gen- erated based on the ALU status, the receive signalling conditions and the previous PE state value. However, no state changes can occur until the CILU has received a clock pulse gen- erated by the PE control program. One clock pulse is generated each time the PE cycles through its control program. The various PE states and explanation of these states is given in Figure 5.8. The CILU is modeled at the logic gate-level. 
Figures 5.9 and 5.10 show the truth table representation of the CILU design. The CILU was decomposed into an outgoing and incoming 80 PE State Outgoing Incoming Explanation P080 P051 P152 P153 PE State 0 O D O idle O O 1 received operand A O O O 1 received operand B O O l l enabled 1 l O 0 results available 1 l l 0 results available and received operand A l l O 1 results available and received operand b l l l 1 results available and received operands A S I 0 l O O A frame results available 1 O O O 8 frame results available 0 l l O A frame results available and received operand A O l O l A frame results available and received operand l O l l l A frame results available and received operands A and I l O l O 8 frame results available and received operand A l O O l I frame results available and received operand I l O l l I frxme :e;ults available and received operands an Fig. 5.8 PE states for data flow computing. 81 mm OUTPUTS Outgoing Outgoing Outgoing Garrsnt Iecsive ALV Transmit State Signalling Status "Seat State Signalling P050 P051 POIA P0“ P51 P1520 P153 P050 P051 POTA POTS ENABLE __,.__1 o o X 8 S O O 0 0 O O O O X 3 O l l l O O l O O X S l l O O O O O l l O O O X l l l l O l l l O O I O l O l O O l ‘ l X 3 O O O O O l 1 X X l X l l o o o l l O l O ‘ l O l O O 1 o 1 X X X o o o o o l l l O X 0 O O O O o 1 x o X X o 1‘ o 1 o l O O x X x l O l O O Fig. 5.9 Logic truth table for the outgoing part of the CILU. 82 IlPUTS OUTPUTS Incoming Incoming Incoming mrrsnt lecaive ALU Trans-it State gignalling Status Part State Signalling I 2 P 3 RA PIRB PST POSO v P051 P152 P153 PITA PITB liq—La !-—-r—. O O O 0 I I 0 0 O O 0 0 1 0 I I 0 l 0 O O O 1 I I 0 1 O l 1 O I O I I l 0 O O l O I l I I 1 l O l 0 1 O I I I 0 1 O O O 1 1 .I I I l 1 1 O 1 l I I 0 O O 0 O O 1 1 I I l I 1 l O O O O 1 1 I I 1 l 1 l 1 1 I I O 1 1 1 O O O O 1 1 I O l 1 l 1 Fig. 5.10 Logic truth table for the incoming part of the CILU. 83 combinational logic designs to reduce the size of the network, since many of the inputs that affect the incoming network have no effect an the outgoing network and vice-versa. The ALU performs the arithmetic operations in the PE. High-level functional modules were used to simulate the ALU. High-level modules were used because the specific operation of the ALU was not important in analyzing the operation of the TDMP architecture. However, the capabil- ity to move input operands to and results and status information from the ALU is important to the operation. As a result in this model the ALU has two input operand registers called A and B that are gate connected to DBUS. Outputs from the ALU consist of a gated connection from the results (R) register and the status (ST) register to DBUS. The remaining data paths for the PE are the C-move processor, program memory and data memory. These data paths are exactly the same as those described in Section 5.1 for the TDS network. The purpose of the processor is to move under stored program control data and signalling to and from the data paths in the PE. In this simulation the data memory is used as a scratch pad by the processor during condi- tional move operations and the program memory contains the PE control program. 5 .3 TDMP Simulation Model Operation In describing the TDMP simulation model operation, we will assume that the PE and TDS program memories have been initialized prior to program execution. We will begin by discussing the PE simulation model operations followed by a discussion of the TDS simulation model operations. 
We end this chapter with a summary discussion of the overall TDMP simulation model.

5.3.1 PE Operation

The flowchart shown in Figure 5.11 and the program code shown in Figure 5.12 describe the programmed control operations of the PE. These instructions are stored in and fetched from the processor program memory. The PE begins by moving the ALU status, the incoming port A and B, and the outgoing port A and B receive signalling bits to the incoming and outgoing communication logic networks, respectively.

Fig. 5.11 Flowchart of the PE control algorithm.

Fig. 5.12 PE control program code.

The processor moves the generated incoming port A transmit signalling bit (ITA) from the CILU to the incoming port A transmit signalling register, ITSIGA. If this bit is logic 1, the processor moves the incoming A operand from IDOUTA to the ALU A input. If this bit is logic 0, the incoming A operand is not moved to the ALU. The ITA bit represents acknowledgment of receiving the incoming A operand. Similarly, ITB is moved from the CILU to the incoming port B transmit signalling register, ITSIGB. If this bit is logic 1, the processor moves the incoming B operand from IDOUTB to the ALU B input. If this bit is logic 0, the incoming B operand is not moved to the ALU. Next the processor moves the outgoing port A transmit signalling bit, OTA, from the CILU to the outgoing port A transmit signalling register, TSIGA. OTA represents a request to send results, as determined by the communications logic network. If this bit is logic 1, the processor moves the results from the ALU output, R, to the outgoing port A transmit results register, TRESA. If this bit is logic 0, the results are not moved. Similarly, the processor moves the outgoing port B transmit signalling bit, OTB, to the outgoing port B transmit signalling register, TSIGB. If this bit is logic 1, the processor also moves the results from the ALU output, R, to the outgoing port B transmit results register, TRESB. The processor now tests the communication logic network execution enable signal, EXEC.
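One complete pass through this control program, including the EXEC test and CILU clocking whose handling is described next, can be sketched compactly. The Python fragment below is only an illustration of the control flow, not the stored move-instruction program of Figure 5.12; the pe object and its attribute names are assumptions made for the example, while the register names follow the text.

    # Illustrative sketch of one pass through the PE control program
    # (not the original stored move-instruction code of Fig. 5.12).
    def pe_control_cycle(pe):
        # 1. Give the CILU its inputs: ALU status and receive signalling bits.
        pe.cilu.set_inputs(pe.alu.status, pe.out_port.rcv_sig, pe.in_port.rcv_sig)

        # 2. Acknowledge and accept incoming operands.
        pe.in_port.ITSIGA = pe.cilu.ITA
        if pe.cilu.ITA == 1:                    # acknowledge operand A
            pe.alu.A = pe.in_port.IDOUTA
        pe.in_port.ITSIGB = pe.cilu.ITB
        if pe.cilu.ITB == 1:                    # acknowledge operand B
            pe.alu.B = pe.in_port.IDOUTB

        # 3. Request to send results on the outgoing A and B frames.
        pe.out_port.TSIGA = pe.cilu.OTA
        if pe.cilu.OTA == 1:
            pe.out_port.TRESA = pe.alu.R
        pe.out_port.TSIGB = pe.cilu.OTB
        if pe.cilu.OTB == 1:
            pe.out_port.TRESB = pe.alu.R

        # 4. Start an ALU operation only when the CILU enables execution.
        if pe.cilu.EXEC == 1:
            pe.alu.INST = pe.instruction        # arithmetic op + start signal

        # 5. Clock the CILU so it can compute its next state, then repeat.
        pe.cilu.clock()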
If the EXEC signal is logic 1, the processor moves an arithmetic instruction and a start-execution signal to the ALU input, INST. If EXEC is logic 0, the arithmetic instruction and start-execution signal are not moved to the ALU. The processor now clocks the communication logic network so that it can generate its next state based on its current inputs and outputs. The processor then jumps to the beginning of its control program to repeat this cycle of move instructions.

5.3.2 TDS Operation

The flowchart shown in Figure 5.13 and the program code shown in Figure 5.14 describe the TDS network processor operations. The SYNC bit is a synchronization pulse that occurs at the beginning of each time slot. This bit is used to synchronize the TDS with the input and output time-division multiplexed buses.

Fig. 5.13 Flowchart for the TDS control algorithm.

Fig. 5.14 TDS control program code.

The TDS processor moves this bit to a scratch pad location in data memory. If SYNC is logic 1, each of the TDS processors moves the FRAME bit to a scratch pad location in its respective data memory. If SYNC is logic 0, the TDS processors continuously move the SYNC bit to the same scratch pad location in their data memories until SYNC is logic 1. If the FRAME bit is logic 1, TDS processor A performs the operations in the input processing branch of the TDS flowchart and TDS processor B performs the operations in the output processing branch of the TDS flowchart. If FRAME is logic 0, processor A performs the output processing operations and processor B performs the input processing operations. The input processing operations begin with a test of the INF bit. The INF bit is moved to a scratch pad location in memory. If INF is logic 1, the processor moves the contents of the SWAD register to the program counter register. The instruction pointed to at this address is a direct move or an indirect move of the DIN register contents, or some other type of requested switching service function. For this example, assume the instruction pointed to by the PC is an indirect move; the processor then moves the contents of the DIN register to a data memory location specified by the contents of the A register. The program now jumps back to the test SYNC instruction. If the INF bit was logic 0, the program does not perform any switching service and immediately jumps back to the test SYNC instruction.
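The whole TDS control loop can be seen at a glance by combining the input processing branch just described with the output processing branch described next. The Python sketch below is only an illustration under that reading of the flowchart; the tds objects, their memories and the helper names are assumptions, not part of the original simulation code.

    # Illustrative sketch of the TDS control loop (input and output branches),
    # not the original stored move-instruction code of Fig. 5.14.
    def tds_cycle(tds_a, tds_b):
        # Wait for the synchronization pulse that starts each time slot.
        while tds_a.SYNC != 1:
            tds_a.scratch["sync"] = tds_a.SYNC      # keep sampling SYNC
        tds_a.scratch["frame"] = tds_a.FRAME        # both processors latch FRAME
        tds_b.scratch["frame"] = tds_b.FRAME

        # FRAME selects which processor handles input vs. output processing.
        if tds_a.scratch["frame"] == 1:
            input_processing(tds_a); output_processing(tds_b)
        else:
            output_processing(tds_a); input_processing(tds_b)

    def input_processing(tds):
        tds.scratch["inf"] = tds.INF
        if tds.INF == 1:
            tds.PC = tds.SWAD                       # jump into the switch program
            # e.g. an indirect move: DIN -> data memory at the address in A
            tds.data_memory[tds.A] = tds.DIN
        # otherwise: no switching service this slot

    def output_processing(tds):
        if tds.OUTF == 0:
            tds.PC = tds.TSC                        # jump via the TSC contents
            # direct move from the data memory location named by the selected
            # move instruction (out_addr is an assumed attribute) to DOUT
            tds.DOUT = tds.data_memory[tds.out_addr]
        # otherwise: nothing to transmit this slot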
The output processing operations begin by testing the OUTF register. If OUTF is logic 0, the contents of TSC are moved to the processor program counter. The instruction pointed to at this address causes a jump to a program memory location that directly moves the contents of a location in data memory to the DOUT register and then returns to the test SYNC instruction. If OUTF is logic 1, the processor immediately jumps back to the test SYNC instruction without moving any data to the DOUT register.

5.4 TDMP Simulation Model Summary

Because of APL workspace limitations, all of the PE variables needed to simulate a twelve-PE TDMP architecture could not be created and stored in one file. However, by using a set-up and an end-up program, we were able to achieve the same effect. The set-up program takes shared variables such as the program memory, the incoming and outgoing port registers, etc., and makes them specific variables for a particular PE by adding a suffix to these variables. When computation is done, the end-up program stores the contents of these specific variables and removes the suffixes to make them shared variables again. Each time a PE is used it must use the set-up and end-up programs.

The APL simulation model provides us a means of simulating the TDMP system operation; however, timing problems can occur, since APL simulation does not include timing simulation methods which could predict timing problems. A timing simulator allows verification of proper circuit behavior in the presence of variation in gate delays. The worst case circuit behavior is obtained based on the minimum and maximum transition delays assigned to the gates in the model. By allowing observation of the circuit whenever simulation time is incremented, rather than waiting or assuming the circuit has stabilized, a better understanding of circuit operation may be obtained to aid in circuit design or in diagnosing problems. Detailed logic gate and timing simulation can be implemented in the APL model by writing programs that perform these functions. However, because these functions are not built into the APL simulation, the cost of implementing them in our model is high.

APL is a sequential simulation language, which means it cannot simulate concurrent activity. However, we can achieve the effect of performing concurrent activity by stopping the simulation timing clock, sequentially performing the computations, and then restarting the clock again. This is an awkward and expensive way to simulate concurrent activity; still, it is the best alternative available. But even given these shortcomings of the simulation language, it does allow us to obtain some useful and significant information for predicting TDMP's system performance. This performance will be discussed with the aid of sample computations in the next chapter.

CHAPTER VI

TDMP PERFORMANCE EVALUATION

Chapter 5 described the TDMP APL simulation model and its operation. In this chapter we evaluate the performance of the TDMP design based on this model. In our investigation we did not design many evaluation tools to measure system performance. For example, we did not design any hardware monitors for measurements of hardware activity or software monitors for event recording during program execution, nor did we develop workloads that represent the work expected of the system. We feel that a rigorous performance evaluation of multiprocessor systems employing these techniques is a research project unto itself.
In addition, there is a question of whether new performance tools are needed, or of how the currently used performance tools need to be extended to be applicable to multiprocessor systems. As a result, we relied primarily on monitoring signals and registers in simulated clock time for recording the activity of small portions of the system, and on monitoring computational results in simulated clock time for measuring total system performance. Section 6.1 discusses the qualitative and quantitative performance results obtained from this simulation model. On the one hand, we have the conventional single-processor systems, which have many uses but whose range of use is limited by the von Neumann bottleneck. On the other hand, we have fixed-array-processor systems, which overcome the von Neumann bottleneck but at the expense of limiting their range of use by the fixed interconnection structure. TDMP eliminates or improves upon the range-of-use limiting factors in both single-processor and fixed-array-processor systems. In Section 6.2 we use data bandwidth as a measure of computational performance and compare the data bandwidths of single-processor, TDMP, and fixed-array-processor systems in computing a fast-Fourier-transform (FFT) and a digital filtering algorithm. This comparison is done to show how much TDMP improves arithmetic computation over single-processor systems and how much data bandwidth we must sacrifice to obtain flexibility over fixed-array-processor systems. The two algorithms chosen for the comparison represent the important class of recursive algorithms used in digital signal processing. In Section 6.3, we discuss what has and has not been learned from the simulation. And, finally, Section 6.4 describes the differences between the TDMP data flow processor and Dennis's data flow processor.37

6.1 Performance Data

The performance data obtained from the simulation model is based on a communication-saturated TDMP system with one processing element. By communication saturated we mean that the system bandwidth is dominated by the interprocessor communication, and the processor arithmetic execution time is negligible compared to this communication time. With this approach, the bottlenecks, delays and throughput we examine are a result of the TDMP architecture only. The PE computes results and transmits these results back to itself through the TDS network. Figure 6.1 shows a block diagram of the simulation model, with the PE shown in two parts for clarity. In the model, time slot zero, TS0, identifies the outgoing port of the PE and time slot one, TS1, identifies the incoming port of the PE. With this model we were able to test the communication protocols between the PE outgoing and incoming communication ports, the A and B frame submultiplexing, the time-division switching network and the data flow control.

The simulation model is synchronized to a master clock. Four simulation clock pulses correspond to one read/write memory cycle, t_m. From the simulation we found that the maximum rate at which a PE can generate result packets, T_N, is once every three cycles through its control program. The first cycle loads the operands into the ALU, the second cycle initiates computation and the third cycle transmits the results. In the PE control program shown in Section 5.3, this corresponds to 120 t_m, assuming that the needed operands have been received and the ALU execution time is zero.
However, if the operands are not immediately available, the time delay in receiving these operands reduces the rate at which the PE generates result packets. This time delay in receiving operands is attributed to the transmission, switching, and packet flow control.

Fig. 6.1 Block diagram representation of a single PE TDMP model.

The worst case transmission delay, T_t, is caused by the time-division multiplexing and submultiplexing of the information packets. In the model T_t = 2NT_X, where N is the number of time slots, T_X is the time slot size, and 2 is the number of submultiplexed frames. The worst case switching delay, T_D, also equals 2NT_X, since in the model the switch sampling period equals the time slot size. The transmission and switching delays are fixed, and each transmitted packet must sequentially experience these delays. This results in a pipeline realization of packets, with the sections of the pipe consisting of the packet generation, transmission and switching. Without including the packet flow control, and assuming the worst case, we found that the model, computing with two input operands, produces the first result packet in T_N + T_t + T_D units of time and the following result packets every T units of time, where T is the maximum of (T_N, T_t, T_D).

The timing diagram shown in Figure 6.2 is the one used for timing and control of the two time slot model. From this diagram we find that the time slot size is 98 simulation clock pulses, which corresponds to 24.5 t_m. If we assume t_m = 30 nsec, then T_D = T_t = (2)(2)(24.5)(30 nsec) = 2.94 μsec and T_N = (120)(30 nsec) = 3.6 μsec. As a result T = T_N, since T_D = T_t < T_N. For a large number of operations this corresponds to a data bandwidth of 277 kHz, where the data bandwidth is defined as the maximum number of results that can be generated per unit time, b_D = 1/T. However, if N > 2 time slots, then T_D = T_t > T_N and T = T_D = 1.47N μsec, resulting in b_D = 1/(1.47N μsec).

Fig. 6.2 Timing diagram for the two time slot model.

If the packet flow control is added to the model, the previously computed b_D decreases significantly. The decrease in bandwidth is due to the interruption of the pipeline operation and the added delay, both caused by the packet flow control protocol. The packet flow control protocol is responsible for preventing new result packets from being generated before previously transmitted result packets have been acknowledged. Since the acknowledge signal is returned after the request signal and data have been received, the time between generating new result packets increases by the amount of time it takes the receiver PE to generate an acknowledge signal plus the time it takes the transmitter PE to receive it. The worst case time for generating an acknowledge signal, T_a, is one cycle through the PE control program, or 40 t_m. This time, plus the transmission and switching delay experienced by the acknowledge signal, is added to the packet generation, transmission and switching time of the result packet sent. As a consequence,

T = T_N + T_a + T_t + T_D = (4.8 + 2.94N) μsec

and, from the model where N = 2,

b_D = 1/T = 93.6 kHz.
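The worst case delay and bandwidth expressions above reduce to simple arithmetic, and it can help to see them evaluated together. The short Python sketch below simply recomputes the numbers quoted in the text (t_m = 30 nsec, T_X = 24.5 t_m, two submultiplexed frames); the function and constant names are ours, not part of the simulation model.

    # Recompute the worst-case TDMP delays and data bandwidths quoted in the text.
    T_M_CYCLE = 30e-9            # t_m: read/write memory cycle time (sec)
    T_SLOT    = 24.5 * T_M_CYCLE # T_X: time slot size (98 simulation clock pulses)
    T_GEN     = 120 * T_M_CYCLE  # T_N: packet generation time (3 control cycles)
    T_ACK     = 40  * T_M_CYCLE  # T_a: time to generate an acknowledge signal

    def bandwidth_no_flow_control(n_slots):
        t_t = 2 * n_slots * T_SLOT           # transmission delay (2 frames)
        t_d = 2 * n_slots * T_SLOT           # switching delay
        t = max(T_GEN, t_t, t_d)             # pipeline limited by its slowest stage
        return 1.0 / t

    def bandwidth_with_flow_control(n_slots):
        t_t = 2 * n_slots * T_SLOT
        t_d = 2 * n_slots * T_SLOT
        t = T_GEN + T_ACK + t_t + t_d        # result packet plus acknowledgment
        return 1.0 / t

    print(bandwidth_no_flow_control(2))      # about 277 kHz
    print(bandwidth_with_flow_control(2))    # about 93.6 kHz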
If sufficiently large receiver first-in-first-out (FIFO) queues, or receive buffers with arbitration circuitry, are used to store and control the incoming packets to the PE, then the packet flow protocol can be omitted and b_D is increased to 277 kHz for N = 2, or to 1/(1.47N μsec) if N > 2 time slots. These are the data bandwidths we obtained for the model when we did not consider the packet flow control. The cost of this bandwidth improvement is a decrease in system reliability, because the acknowledge signal used for packet flow control was our only verification that the transmitted packet was received successfully.

6.2 Comparison of Computational Performance

We decided to use the performance data obtained from the simple one-PE model of TDMP to determine and calculate the (data) bandwidth of a large TDMP system suitable for implementing FFT and digital filtering algorithms. This decision was based on the high cost of the TDMP APL simulation and the small amount of new information that would be gained in simulating a large system. This single-PE model is justified because TDMP is a synchronous system and there is no contention for communication. Each PE is given a designated time slot to transmit and receive data. As a result all PEs are equal, and we can determine, on a worst case basis, the time required for each PE to compute, transmit and receive results. This is different from an asynchronous system, where the time for each PE to transmit and receive results varies and where, depending on the type of interconnection network, there may be some contention for communication. The (data) bandwidth was chosen as the performance index in comparing the computational performance of TDMP, single-processor and fixed-array-processor systems. This choice is more appropriate than the memory cycle time or the instruction execution time, since it allows for the fact that instructions may be executed concurrently. The evaluation of the b_D's is based on each system performing the same FFT and digital filtering algorithms with comparable processors. We begin this analysis with a comparison of the b_D's resulting from the processing systems performing an FFT algorithm.

The FFT algorithm implemented on each processing system is based on the decimation-in-time principle and is illustrated for an 8-point sequence in the signal flow graph shown in Figure 6.3. In the signal flow graph there are four columns, and each column contains eight entries. For the sake of clarity, the two-dimensional variable y(k,i) is used to denote the value of a given node in the array, where k is the number of the column and i is the number of the component within the column. At the node corresponding to column k and row i, the variable y(k,i) is found from an equation of the form

y(k,i) = y(k-1,i1) + W^r y(k-1,i2)     (1)

where i1, i2 and r are functions of the location within the array and W = e^(-j2π/N).

Fig. 6.3 Signal flow graph of an 8-pt. FFT.

In each case, the dashed line connecting the variable in column k-1 with column k refers to the first term on the right-hand side of Equation (1), i.e., the nonweighted term. The solid line refers to the second term on the right-hand side of (1), i.e., the weighted term. The number in the circle is the degree of W, as indicated by the term r in (1).
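Equation (1) is the standard radix-2 decimation-in-time butterfly. As a concrete check of the kind of signal flow graph drawn in Figure 6.3, the Python sketch below evaluates the columns y(k,i) for an 8-point sequence using exactly this recurrence; the index functions i1, i2 and the twiddle exponent r are computed in the usual bit-reversed-input, in-place form, which is one common realization of such a graph rather than a transcription of the figure itself.

    import cmath

    def fft_dit(x):
        """Power-of-two decimation-in-time FFT built from the butterfly
        y(k,i) = y(k-1,i1) + W**r * y(k-1,i2) of Equation (1)."""
        n = len(x)
        levels = n.bit_length() - 1
        # Column k = 0: input samples in bit-reversed order.
        y = [x[int(format(i, f"0{levels}b")[::-1], 2)] for i in range(n)]
        w = cmath.exp(-2j * cmath.pi / n)          # W = e^(-j*2*pi/N)
        for k in range(1, levels + 1):             # columns of the flow graph
            span = 1 << k
            half = span >> 1
            nxt = y[:]                             # column k built from column k-1
            for start in range(0, n, span):
                for j in range(half):
                    r = j * (n // span)            # degree of W at this node
                    top, bot = start + j, start + j + half
                    nxt[top] = y[top] + w**r * y[bot]
                    nxt[bot] = y[top] - w**r * y[bot]   # same as W**(r + N/2) weight
            y = nxt
        return y

    # Example: the transform of an impulse is a column of ones.
    print(fft_dit([1, 0, 0, 0, 0, 0, 0, 0]))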
The important parameters needed to determine and compare the (data) bandwidth among the three systems implementing this FFT algorithm are defined as follows:

N = the number of data points in the FFT
T_E = execution time of the processor in performing the butterfly computation, y3 = y1 + W^r y2
M = the average number of FFT operations to be performed
T_X = time slot size
T = worst case TDMP system delay
t_r = register delay
t_m = read/write memory cycle time
b_D(M) = data bandwidth for a finite number of FFT operations
b_D = data bandwidth when the number of FFT operations is large (M -> infinity)

The b_D analysis assumes that all initialization of the processors has occurred prior to the beginning of the FFT program execution, and that each individual processor performs the same computation, y3 = y1 + W^r y2, in the same amount of time. To compute y3, each processor must perform 4 real multiplications and 2 additions. We also assume that the input and output interfaces for each system are manageable and not a limiting factor in the b_D performance. With these assumptions given, the b_D of a single-processor system, b_D1, can be determined from the following equation:

b_D1 = lim (M -> infinity) b_D1(M) = 1 / [T_E (N log2 N) + 6 t_m (N log2 N)]     (2)

In Equation (2), T_E (N log2 N) is the arithmetic execution time needed to compute one FFT operation with a single processor, and 6 t_m (N log2 N) is the accessing and storage time of the real and imaginary parts of the two input operands and the results during one FFT operation.

The TDMP b_D is based on a TDMP system with N log2 N PEs. Each PE is assigned a location in the N x log2 N pipeline array of processors, as represented in the signal flow graph shown in Figure 6.3 for N = 8. Each PE is programmed to execute the butterfly computation as a single data flow operation and to transmit the results of this computation to the next column of PEs based on its location assignment within the array. This arrangement of PEs forms an asynchronous pipeline flow of result packets, with the synchronization of the computations controlled by the data flow principles at the architecture level. The worst case b_D for TDMP is characterized by the following equations:

b_D2(M) = M / [(T_E + T) log2 N + (T_E + T)(M - 1)]     (3)

and

b_D2 = lim (M -> infinity) b_D2(M) = 1 / (T_E + T)     (4)

where T is the worst case TDMP system delay. T is (4.8 + 2.94N) μsec if the packet flow control protocol is used and 1.47N μsec if it is not used. These two cases correspond to not using and to using input queues as buffers in the PEs. The term (T_E + T) log2 N is the system delay, i.e., the time before the first results appear at the output of the system. After the system delay, an FFT operation result appears every T_E + T units of time for an unbounded number of operations.

The fixed-array-processor directly implements the signal flow graph of Figure 6.3 with hardwired interconnections between the processors. Each interconnection path contains a register to synchronize the data flow between processors. The execution time, T_E, and the register delay time, t_r, determine the processor b_D:

b_D3(M) = M / [(T_E + t_r) log2 N + (T_E + t_r)(M - 1)]

b_D3 = lim (M -> infinity) b_D3(M) = 1 / (T_E + t_r)

Figure 6.4 shows a graph of the FFT data bandwidths for the single-processor, TDMP, and fixed-array-processor as a function of the number of operations, M. The curves were generated assuming that N = 8, t_m = 30 nsec, t_r = 10 nsec and T_E = (4T_M + 2T_A), where T_M and T_A are respectively the hardware processor floating-point multiplication time (27 μsec) and addition time (14 μsec).
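Equations (2) through (4) and the fixed-array expressions are easy to tabulate for the parameter values just quoted. The Python sketch below is our own check on the comparison, not part of the original analysis; it uses the curve parameters as read from the text (N = 8, t_m = 30 nsec, t_r = 10 nsec, T_M = 27 μsec, T_A = 14 μsec), and the two choices of T correspond to operating with and without the packet flow control protocol. The exact improvement factors obtained this way depend on how those parameters are read from the figure.

    from math import log2

    # Curve parameters from the text (all times in microseconds).
    N   = 8
    t_m = 0.030                  # read/write memory cycle time
    t_r = 0.010                  # register delay
    T_E = 4 * 27.0 + 2 * 14.0    # butterfly time: 4 multiplies + 2 adds

    def T_tdmp(n_slots, flow_control=True):
        # Worst case TDMP system delay from Section 6.1.
        return 4.8 + 2.94 * n_slots if flow_control else 1.47 * n_slots

    def b_single():                              # Equation (2)
        return 1.0 / (T_E * N * log2(N) + 6 * t_m * N * log2(N))

    def b_tdmp(M, T):                            # Equations (3) and (4)
        return M / ((T_E + T) * log2(N) + (T_E + T) * (M - 1))

    def b_fixed_array(M):
        return M / ((T_E + t_r) * log2(N) + (T_E + t_r) * (M - 1))

    M = 10_000                                   # a "large" number of FFT operations
    print("single-processor b_D  :", b_single())
    print("fixed-array b_D       :", b_fixed_array(M))
    print("TDMP b_D, flow control:", b_tdmp(M, T_tdmp(N, True)))
    print("TDMP b_D, with queues :", b_tdmp(M, T_tdmp(N, False)))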
Fig. 6.4 Bandwidth comparison as a function of the average number of FFT operations.

The curves in Figure 6.4 show that, for a large number of FFT operations, the TDMP b_D is more than 15-times larger than the b_D of a single-processor system and 1.5-times smaller than that of a fixed-array-processor in computing an 8-point FFT. The b_D of the fixed-array-processor can be viewed as an upper bound and as the maximum b_D improvement we can obtain over the single-processor with TDMP. In order to achieve this maximum improvement we must reduce the interprocessor communication overhead, either with hardware or by reducing the communication protocols that are in software. For example, the dashed curve shows the improvement in the TDMP b_D if input queues are used. In this case the TDMP b_D is more than 19-times larger than the b_D of a single-processor and 1.2-times smaller than that of a fixed-array-processor. This comparison shows that for a small reduction in b_D relative to the fixed-array-processor, TDMP enhances both the single-processor and the fixed-array-processor range of usefulness, e.g., it increases b_D over single processors and adds flexibility over fixed-array-processors.

Fig. 6.5 Bandwidth comparison as a function of PE execution time for large M.

Figure 6.5 shows the b_D's as a function of the processor execution time for large M. From these curves we can see that the single-processor system has a larger b_D than TDMP if the execution time is less than 2.8 μsec, and that as the execution time increases above 2.8 μsec this situation reverses. From the same curves we see that if T_E is greater than about 1000 μsec, the b_D of TDMP and that of the fixed-array-processor are about the same. These results are very important and can be easily explained. In the first case, where the TDMP b_D is less than that of a single-processor, the PEs in the TDMP are spending most of their time communicating and waiting for new operands, because the computation time to communication time ratio is small (< 0.04). In the second case, where the TDMP b_D is comparable to that of the fixed-array-processor, the computation time to communication time ratio is large (> 13), so the PEs spend the majority of their time computing instead of idling. This points out the importance of granularity in TDMP. Granularity is the size of the task a PE must perform before it is required to communicate with other PEs in the system. For TDMP the granularity must be large, such that the computation time to communication time ratio is large for a given problem.

To illustrate the advantage that TDMP has over fixed-array-processors for this class of problems, we simply change the application problem. Changing the application problem in TDMP requires no hardware changes. We simply reprogram the data paths and input new information packets with new tag values, assuming of course that the program for the new task has been previously stored in the PE. However, changing the application in a fixed-array-processor requires changing the hardwired interconnection paths between processors, or possibly experiencing a significant degradation in the b_D. In addition, if the problem size changes, unneeded PEs in the TDMP architecture can be utilized for multiprogramming purposes.
Since TDMP is based on data flow principles and suffers no side effects of concurrent processing, we can execute two or more different application problems simultaneously. There is no way in which fixed-array-processors can do this, because in fixed-array-processors the correctness of the results requires all processors to be synchronized together. As a result, unneeded processors must remain idle, thus reducing the utilization of the fixed-array-processing system.

As a second example of TDMP performance, we will compare the b_D's of the three processing systems implementing the digital filtering algorithm given in Chapter 3 and shown in Figure 3.8. In the single-processor system, 4 multiplications, 3 additions, and 8 memory accesses and stores must be accomplished to compute one filter result. Thus

b_D1 = 1 / (4 t_M + 3 t_A + 8 t_m)

For the TDMP system, we assume N = 8 to make the architecture fit the problem size and directly implement the signal flow graph shown in Figure 3.8. This results in

b_D2 = 1 / (t_M + T)

This b_D assumes that many filter computations are performed and that the multiplication time is larger than the addition time. Similarly, for the fixed-array-processor system we assume an architecture size to fit the problem size and directly implement the signal flow graph, which results in

b_D3 = 1 / (t_M + t_r)

Fig. 6.6 Bandwidths as a function of multiplication time for the digital filtering computations.

Figure 6.6 shows the b_D's for each processing system as a function of the processor multiplication time. In this example we see that the b_D of TDMP is larger than that of the single-processor for all values of processor multiplication time greater than 15 nsec, and less than that of the fixed-array-processor until the multiplication time exceeds 500 nsec. The starting point of 15 nsec was used so that our equations for the TDMP and fixed-array-processor bandwidths will be valid, since t_A < t_M is required for these equations to be correct. The results from this analysis again show us the effect that communication overhead has on the TDMP b_D and the amount by which TDMP enhances b_D over single-processor systems.
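As with the FFT comparison, the three digital-filter bandwidth expressions can be evaluated directly. The short Python sketch below simply tabulates b_D1, b_D2 and b_D3 over a range of multiplication times; the specific values of t_A, t_m, t_r and T used here are illustrative assumptions of ours (the text only fixes the relation t_A < t_M), so the printed numbers should not be read back into Figure 6.6.

    # Tabulate the three digital-filter data bandwidths as functions of the
    # processor multiplication time t_M.  Parameter values are assumed.
    def b_d1(t_mul, t_add, t_mem):
        # single processor: 4 multiplies, 3 adds, 8 memory accesses/stores
        return 1.0 / (4 * t_mul + 3 * t_add + 8 * t_mem)

    def b_d2(t_mul, T):
        # TDMP (N = 8): one multiply per PE plus the worst case system delay T
        return 1.0 / (t_mul + T)

    def b_d3(t_mul, t_reg):
        # fixed-array processor: one multiply plus one register delay per stage
        return 1.0 / (t_mul + t_reg)

    t_add, t_mem, t_reg, T = 10e-9, 30e-9, 10e-9, 100e-9    # assumed values (sec)
    for t_mul in (15e-9, 50e-9, 200e-9, 500e-9, 2e-6):
        print(t_mul, b_d1(t_mul, t_add, t_mem), b_d2(t_mul, T), b_d3(t_mul, t_reg))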
6.3 Discussion of the Simulation

The simulation model was the driving force behind designing the specific components, and the interrelationships among these components, that make a TDMP system work. By using a simulation model of the design, we did not have to purchase, build and test printed circuit boards and the LSI components on these boards in order to observe the performance and operation of the TDMP system. With simulation we were able to test new ideas quickly and to make changes easily. We also were able to develop the precise timing relationships for the system operation and to determine what influences these relationships. We did not find out how TDMP performs when the system hardware components are not ideal but vary with temperature and age. We were not able to truly simulate concurrent PE activity, because the simulation processor was sequential. The capacitance of the bus data paths that influences signal propagation could not be simulated inexpensively. To simulate this capacitance we would need a stick layout of the integrated circuit design of the TDMP architecture in order to determine, at best, a good guess of the bus lengths necessary to calculate a capacitance value that could be simulated for each bus data path in this model. This approach is very expensive and time consuming, and it is not necessary for this initial simulation model, so it was not done. However, this should be done when simulating the architecture for actual VLSI implementation, to study its effect on performance.

6.4 TDMP and Dennis Data Flow Models

There are two primary differences between the TDMP data flow architecture and the data flow architecture developed by Dennis at MIT.37 The first difference is that in TDMP the data flow operators and memory cells are in the same PE module, with an interprocessor communication network used to interchange result information packets between PEs. In Dennis's model the operators and memory cells are separated. All information packets are transmitted through an arbitration network from the memory section to the operators, and all result information packets are transmitted from the operators through a distribution network to the memory cells. The TDMP approach does not increase the amount of concurrent activity over Dennis's model but rather allows for locality of information flow. This approach reduces the amount of parallelism in a problem required for high PE utilization. The argument about locality of information flow is true for any multiple processor machine in which each processor holds a part of an overall program. However, two factors increase the likelihood that such good decomposition can be achieved for TDMP. First, since TDMP is based on data flow principles, operations are constrained only by the availability of data; tolerance of asynchronous behavior at the architecture level allows greater flexibility in mapping programs. Second, since data flow languages are free of side effects, decomposition is an easier task to perform. The second difference is that TDMP uses time-division communication techniques for the transmission and switching of information packets, whereas Dennis's model is designed for space-division techniques. The time-division techniques are better at conserving valuable chip area than the space-division techniques for single-chip VLSI implementation. And the use of an intelligent switching network in TDMP allows for the possibility of performing operations other than "plain old switching". For example, the switch could modify the data as it is being transmitted to the destination PE, or implement some of the system operating functions to improve reliability and diagnosability in the system.

CHAPTER VII

CONCLUSIONS

7.1 Summary

Advances in improving the computer's computational speed result from reducing the switching delay times of elementary logic gates, improving methods for performing primitive arithmetic operations, and designing computer architectures that overcome the intrinsic speed limitations of the von Neumann machine. While the first two approaches improve the computational speed, the third approach provides the potential for making the largest increase in computational speed. Here, multiprocessors are employed; these multiprocessor architectures exploit the inherent parallelism in specific tasks, thereby eliminating the single-processor bottleneck -- the "von Neumann bottleneck". In addition, VLSI is making it feasible to build these high-performance multiprocessor structures as low-cost, flexible, single-chip system components.
So, the purpose of the research reported here was to investigate a multiple processor computer structure that exploits this technology to improve the bandwidth of comparable single-processor machines and to increase the flexibility of existing fixed-array multiprocessors. Within the research reported here, the investigation attained several objectives. First, it surveyed the previous work on multiprocessor architectures to identify their principal attributes, and it suggested how well suited these attributes are for VLSI implementation for enhanced computation. Desirable multiprocessor VLSI attributes include:

— majority of the processors are identical;
— processors and memory all in close proximity;
— simple underlying communication geometry;
— regular and modular structure;
— few interconnection data paths between processors;
— flexible data paths.

Next, it presented an architecture called TDMP (Time Division Multiple Processing) and described how it achieves the following general properties:

— greater bandwidth than comparable single-processor architectures;
— less complex switching network than a crossbar switch interconnection network;
— more flexible than a fixed-array network;
— full-access and non-blocking interconnection capabilities;
— simply extends to pipeline operation;
— highly compatible with data flow algorithms;
— amenable to VLSI implementation.

A computer-based simulation model of TDMP was developed to investigate the computational capabilities, the hardware components and the interrelationship of these components with one another. Finally, through two algorithms that represent the important class of recursive algorithms used in digital signal processing, it compared the computational data bandwidth of TDMP to comparable single-processor and fixed-array-processor systems, to show how much TDMP improves arithmetic computation bandwidth over single-processor systems and how much bandwidth we must give up to obtain flexibility over fixed-array-processor systems. By satisfying these objectives, the investigation achieved the overall purpose of the research project.
The idea of organizing a computer to operate on data as soon as its operands beume available has been discussed by Dennis, Gurd and Watson, Arvind, Miller and Cocke.36’37 However, none of these authors has suggested a detailed and efficient scheme for communicating information packets to processors for computation. This investigation has shown through simulations that an integrated interprocessor communiution system using time-division multiplexing and time-division switching offers an attractive solution to this problem. Also it was shown that TDMP is unique beuuse time-division communiution techniques have not been previously investigated in data flow or conventional multiprocessor architectures. To develop TDMP, we built and tested an APL simulation model of the architecture. The investigation found that the use of such a model was helpful in the design and analysis of the architecture. Using the performance data of a single PE TDMP model, it was shown that TDMP un improve the data bandwidth (computational speed) over single-processor systems by factors of 15 and 10. when computing FFT‘s and digital filtering results, respectively. The analysis also indiuted that in order to achieve this improvement the granularity of tasks for 114 each PE must be chosen such that the computation time to communiution time ratio for each PE is large. The investigation also revealed some negative attributes and deficiencies of the simulation languages available for our simulation purposes. The negative attributes, common to all of the simulation languages considered in this investigation, result from these languages not being able to truly simulate concurrent processing or asynchronous events. The reason for this is these languages are based on a single-processor executing instructions sequentially. The deficiencies of each simulation language varies with the language type. For example, APL does not have a built in timing simulator but GPSS does. However, GPSS is not particularly suited to the translation from a hardware representation to a computer program but APL is. The negative attributes and deficiencies of each simulation language considered in this investigation is discussed in more detail in Chapter 5 of this thesis. Using the results of this study, designers of multiprocessor systems will be able to create advance systems with increased computational performance over single-processor systems, a wider range of use over fixed-array-processor systems and a hardware design that copes with interconnection data path complexity imposed by VLSI implementation. Second, since TDMP is based on data flow principals, i.e. operations are constrained only by the availability of data, the architecture allows greater flexibility in mapping programs and since, data flow languages are free of side efl'ects decomposition of programs is an easier task to perform. The second results will aid programmers in developing parallel software to keep the parallel hardware fully occu- pied. The third result is an APL computer model of the TDMP system which provides a model for developers of multiprocessor systems to use in investigating the computation bandwidth of TDMP if some of the parameters of the structure are varied. For example the bandwidth for a specific problem can be investigated as the number of time slots, the time slot size and the number of multiplexed buses is varied. 
Currently, the model has the flexibility to vary the number of time slots, the time slot size and the control software for both the PEs and the time-division switch. However, it cannot vary the number of time-division multiplexed (TDM) buses. This feature can be implemented by duplicating existing parts of the model and adding a new time multiplex switch part. This change results in a new time-division switch with multiple input and output lines. The new switch can switch data between processors on the same or on different time-division multiplex buses. Changing the number of time slots on a TDM bus or the number of TDM buses corresponds to varying the N and M parameters of the model, respectively. And finally, the use of an integrated communication system based on time-division techniques has not been previously investigated in multiprocessor structures for enhanced computation. So the results of this investigation provide designers with a new trajectory for multiprocessor design and with the implications of using these techniques in those designs.

7.2 Further Research

As with any research project, the investigation reported here points toward several areas of additional study. Further investigation can be made into showing new options possible in multiprocessor architectures as a result of using time-division communication techniques. For example, since the switching network is a C-move processor, we can now think of ways to enhance computation by modifying the data as it is being switched. We may also be able to modify the switch to improve reliability and diagnosability in the system. The research can be extended into analyzing the data bandwidth, hardware and software requirements of TDMP if input queues or buffers with arbiters are added to the processing elements. This corresponds to making the processing elements into multiple-input data flow operators. Research into dynamically allocating multiple time slots to PEs can be investigated. This improves the time-division multiplex bus bandwidth utilization, because unused time slots caused by idle or non-communicating PEs can be allocated to PEs that need to transmit or receive data more frequently. Research into the modularity of TDMP can be further investigated. By employing the concepts of time multiplex switching and pipelining, larger TDMP systems can be developed from simpler TDMP systems to solve larger problems. And, finally, further research into the development of higher-level programming languages is needed. The underlying trouble with most current attempts to use parallel hardware is that they are based on traditional concepts of programming. These concepts in turn are based wholly on the serial von Neumann computer design, with instructions executed one at a time. In view of the nature of parallel hardware systems and the practical difficulties of keeping them running at full speed, we suggest further investigation of data flow programs based on the TDMP computer model.

REFERENCES

1. R. Sugarman, "Superpower Computers," IEEE Spectrum, April 1980.
2. C. Mead and L. Conway, Introduction to VLSI Systems, Addison-Wesley Publishing Company, Inc., 1980.
3. K. Hwang, Computer Arithmetic, John Wiley and Sons, Inc., 1979.
4. S. S. Reddi and E. A. Feustel, "A Conceptual Framework for Computer Architecture," Computing Surveys, Vol. 8, No. 2, June 1976.
5. A. Mohsen, "Device and Circuit Design for VLSI," Caltech Conference on VLSI, January 1979.
6. G. A. Jullien, "Using ROM Arrays to Implement Computer Arithmetic," IEEE Computer Arithmetic, 1979.
7. C. Mead, "VLSI and Technological Innovation," Caltech Conference on VLSI, January 1979.
8. R. Benard, "Computers: Emphasis on Software," IEEE Spectrum, January 1980.
9. Intel Corporation, 8086 Family User's Manual, October 1979.
10. Intel Corporation, 8087 Co-processor, private communication, May 1980.
11. R. Friedenson, "RC Active Filters for the D3 Channel Bank," Memorandum for File, Bell Labs, December 1971.
12. K. Thurber, "Associative and Parallel Processors," Computing Surveys, Vol. 7, No. 4, December 1974.
13. J. P. Hayes, Computer Architecture and Organization, McGraw-Hill Book Company, pp. 209-236, 1978.
14. W. Wulf and C. Bell, "C.mmp--A Multi-Mini-Processor," Fall Joint Computer Conference, pp. 765-777, 1972.
15. F. Briggs, K. S. Fu, K. Hwang and J. Patel, "PM4--A Reconfigurable Multiprocessor System for Pattern Recognition and Image Processing," National Computer Conference, pp. 225-265, 1979.
16. W. Wulf and S. Harbison, "Reflections in a Pool of Processors--An Experience Report on C.mmp/Hydra," National Computer Conference, pp. 939-950, 1978.
17. J. R. Jump and S. Ahuja, "Effective Pipelining of Digital Systems," IEEE Transactions on Computers, Vol. C-27, No. 9, September 1978.
18. C. V. Ramamoorthy and H. F. Li, "Pipeline Architecture," Computing Surveys, Vol. 9, No. 1, March 1977.
19. G. H. Barnes, et al., "The Illiac IV Computer," IEEE Transactions on Computers, Vol. C-17, pp. 746-757, August 1968.
20. K. J. Thurber, "Parallel Processor Architectures," Computer Design, pp. 89-114, February 1979.
21. H. T. Kung, "Let's Design Algorithms for VLSI Systems," Caltech Conference on VLSI, January 1979.
22. A. L. Davis, "A Data-Driven Machine Architecture Suitable for VLSI Implementation," Caltech Conference on VLSI, January 1979.
23. B. W. Arden and H. Lee, "Analysis of Chordal Ring Network," Workshop on Interconnection Networks for Parallel and Distributed Processing, April 1980.
24. M. A. Franklin, "VLSI Performance Comparison of Banyan and Crossbar Communications Networks," Workshop on Interconnection Networks, pp. 20-28, April 1980.
25. F. Briggs, PM4, private communication, May 1980.
26. H. J. Smith and D. S. Smith, "Study of Multistage Interconnection Networks," Proceedings of the Fifth Annual Conference on Computer Architecture, April 1978.
27. D. M. Dias and J. R. Jump, "Analysis and Simulation of Buffered Delta Networks," Workshop on Interconnection Networks, April 1980.
28. M. C. Pease, III, "The Indirect Binary n-Cube Microprocessor Array," IEEE Transactions on Computers, Vol. C-26, No. 5, May 1977.
29. R. G. Hintz and D. P. Tate, "Control Data STAR-100 Processor Design," Proceedings of the 6th Annual IEEE Computer Society International Conference (COMPCON '72), San Francisco, CA, pp. 1-4, September 1972.
30. R. L. Sites, "An Analysis of the Cray-1 Computer," Computer Design, January 1979.
31. M. Satyanarayanan, "Commercial Multiprocessing Systems," IEEE Computer Society, May 1980.
32. A. E. Joel, Jr., "Digital Switching--How It Has Developed," IEEE Transactions on Communications, Vol. COM-27, No. 7, July 1979.
33. H. J. Siegel, "Interconnection Networks for SIMD Machines," Computer, pp. 57-65, June 1979.
34. D. Tabak and G. J. Lipovski, "Move Architecture in Digital Controllers," IEEE Journal of Solid State Circuits, Vol. SC-15, No. 1, February 1980.
35. J. B. Dennis and D. P. Misunas, "A Preliminary Architecture for a Basic Data Flow Processor," Computer Architecture, pp. 126-132, 1975.
36. J. Gurd and I. Watson, "Data Driven System for High Speed Parallel Computing - Part I: Structuring Software for Parallel Execution," Computer Design, June 1980.
37. J. B. Dennis, "First Version of a Data Flow Procedure Language," Project MAC, MIT, Cambridge, MA, 1974.
38. J. Rumbaugh, "A Data Flow Multiprocessor," IEEE Transactions on Computers, 1977.
39. J. Gurd and I. Watson, "Data Driven System for High Speed Parallel Computing - Part II: Hardware Design," Computer Design, July 1980.
40. M. Malek and W. W. Myre, "Figures of Merit for Interconnection Networks," Proc. of Workshop on Interconnection Networks, April 1980.
41. H. Levin, Introduction to Computer Analysis: ECAP for Electronic Technicians and Engineers, Prentice-Hall, 1970.
42. T. J. Schriber, Simulation Using GPSS, Wiley, 1974.
43. P. A. Bobillier, B. C. Kahan, and A. R. Probst, Simulation with GPSS and GPSS V, Prentice-Hall, 1976.
44. G. A. Blaauw, Digital System Implementation, Prentice-Hall, Inc., 1976.
45. L. Gilman and A. J. Rose, APL: An Interactive Approach, John Wiley and Sons, Inc., 1974.
46. Control Data Corporation, APL Version 2 Reference Manual, 1979.