summaryrefslogtreecommitdiffstats
path: root/src/soc/nvidia/tegra132/spi.c
blob: 6d9fa1f4c1e3dbd923a550aec7673b5c1e608d51 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
/*
 * NVIDIA Tegra SPI controller (T114 and later)
 *
 * Copyright (c) 2010-2013 NVIDIA Corporation
 * Copyright (C) 2013 Google Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <arch/cache.h>
#include <arch/io.h>
#include <assert.h>
#include <cbfs.h>
#include <console/console.h>
#include <delay.h>
#include <inttypes.h>
#include <spi-generic.h>
#include <spi_flash.h>
#include <soc/addressmap.h>
#include <soc/dma.h>
#include <soc/spi.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <timer.h>


/*
 * DEBUG_SPI() emits a printk-style debug message prefixed with "TEGRA_SPI: "
 * when CONFIG_DEBUG_SPI is enabled and expands to nothing otherwise.
 * The variadic arguments are forwarded to printk() so that format
 * specifiers in the message are actually backed by values.
 */
#if defined(CONFIG_DEBUG_SPI) && CONFIG_DEBUG_SPI
# define DEBUG_SPI(x,...)	printk(BIOS_DEBUG, "TEGRA_SPI: " x, ##__VA_ARGS__)
#else
# define DEBUG_SPI(x,...)
#endif

/*
 * Transfer size limits.
 *
 * 64 packets in FIFO mode, BLOCK_SIZE packets in DMA mode. Packets can vary
 * in size from 4 to 32 bits. To keep things simple we'll use 8-bit packets,
 * so one packet is one byte and the limits below are byte counts.
 */
#define SPI_PACKET_SIZE_BYTES		1
#define SPI_MAX_TRANSFER_BYTES_FIFO	(64 * SPI_PACKET_SIZE_BYTES)
#define SPI_MAX_TRANSFER_BYTES_DMA	(65535 * SPI_PACKET_SIZE_BYTES)

/*
 * This is used to work around an issue seen where it may take some time for
 * packets to show up in the FIFO after they have been received and the
 * BLOCK_COUNT has been incremented. It bounds (in microseconds) how long
 * tegra_spi_pio_finish() waits for the Rx FIFO count to catch up.
 */
#define SPI_FIFO_XFER_TIMEOUT_US	1000

/*
 * Register field definitions.
 *
 * Fields that occupy bit 31 are built from an unsigned constant: shifting a
 * signed 1 into the sign bit is undefined behaviour in C and yields a
 * negative value that sign-extends when widened to 64 bits.
 */

/* COMMAND1 */
#define SPI_CMD1_GO			(1U << 31)
#define SPI_CMD1_M_S			(1 << 30)
#define SPI_CMD1_MODE_MASK		0x3
#define SPI_CMD1_MODE_SHIFT		28
#define SPI_CMD1_CS_SEL_MASK		0x3
#define SPI_CMD1_CS_SEL_SHIFT		26
#define SPI_CMD1_CS_POL_INACTIVE3	(1 << 25)
#define SPI_CMD1_CS_POL_INACTIVE2	(1 << 24)
#define SPI_CMD1_CS_POL_INACTIVE1	(1 << 23)
#define SPI_CMD1_CS_POL_INACTIVE0	(1 << 22)
#define SPI_CMD1_CS_SW_HW		(1 << 21)
#define SPI_CMD1_CS_SW_VAL		(1 << 20)
#define SPI_CMD1_IDLE_SDA_MASK		0x3
#define SPI_CMD1_IDLE_SDA_SHIFT		18
#define SPI_CMD1_BIDIR			(1 << 17)
#define SPI_CMD1_LSBI_FE		(1 << 16)
#define SPI_CMD1_LSBY_FE		(1 << 15)
#define SPI_CMD1_BOTH_EN_BIT		(1 << 14)
#define SPI_CMD1_BOTH_EN_BYTE		(1 << 13)
#define SPI_CMD1_RX_EN			(1 << 12)
#define SPI_CMD1_TX_EN			(1 << 11)
#define SPI_CMD1_PACKED			(1 << 5)
#define SPI_CMD1_BIT_LEN_MASK		0x1f
#define SPI_CMD1_BIT_LEN_SHIFT		0

/* COMMAND2 */
#define SPI_CMD2_TX_CLK_TAP_DELAY	(1 << 6)
#define SPI_CMD2_TX_CLK_TAP_DELAY_MASK	(0x3F << 6)
#define SPI_CMD2_RX_CLK_TAP_DELAY	(1 << 0)
#define SPI_CMD2_RX_CLK_TAP_DELAY_MASK	(0x3F << 0)

/* SPI_TRANS_STATUS */
#define SPI_STATUS_RDY			(1 << 30)
#define SPI_STATUS_SLV_IDLE_COUNT_MASK	0xff
#define SPI_STATUS_SLV_IDLE_COUNT_SHIFT	16
#define SPI_STATUS_BLOCK_COUNT		0xffff
#define SPI_STATUS_BLOCK_COUNT_SHIFT	0

/* SPI_FIFO_STATUS */
#define SPI_FIFO_STATUS_CS_INACTIVE			(1U << 31)
#define SPI_FIFO_STATUS_FRAME_END			(1 << 30)
#define SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_MASK		0x7f
#define SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_SHIFT	23
#define SPI_FIFO_STATUS_TX_FIFO_EMPTY_COUNT_MASK	0x7f
#define SPI_FIFO_STATUS_TX_FIFO_EMPTY_COUNT_SHIFT	16
#define SPI_FIFO_STATUS_RX_FIFO_FLUSH			(1 << 15)
#define SPI_FIFO_STATUS_TX_FIFO_FLUSH			(1 << 14)
#define SPI_FIFO_STATUS_ERR				(1 << 8)
#define SPI_FIFO_STATUS_TX_FIFO_OVF			(1 << 7)
#define SPI_FIFO_STATUS_TX_FIFO_UNR			(1 << 6)
#define SPI_FIFO_STATUS_RX_FIFO_OVF			(1 << 5)
#define SPI_FIFO_STATUS_RX_FIFO_UNR			(1 << 4)
#define SPI_FIFO_STATUS_TX_FIFO_FULL			(1 << 3)
#define SPI_FIFO_STATUS_TX_FIFO_EMPTY			(1 << 2)
#define SPI_FIFO_STATUS_RX_FIFO_FULL			(1 << 1)
#define SPI_FIFO_STATUS_RX_FIFO_EMPTY			(1 << 0)

/* SPI_DMA_CTL */
#define SPI_DMA_CTL_DMA			(1U << 31)
#define SPI_DMA_CTL_CONT		(1 << 30)
#define SPI_DMA_CTL_IE_RX		(1 << 29)
#define SPI_DMA_CTL_IE_TX		(1 << 28)
#define SPI_DMA_CTL_RX_TRIG_MASK	0x3
#define SPI_DMA_CTL_RX_TRIG_SHIFT	19
#define SPI_DMA_CTL_TX_TRIG_MASK	0x3
#define SPI_DMA_CTL_TX_TRIG_SHIFT	15

/* SPI_DMA_BLK */
#define SPI_DMA_CTL_BLOCK_SIZE_MASK	0xffff
#define SPI_DMA_CTL_BLOCK_SIZE_SHIFT	0

/*
 * Per-controller state for SPI1..SPI6. The entry for bus N is stored at
 * index N - 1; the lookups elsewhere in this file index the table that way.
 *
 * Note: Tegra pinmux must be set up for the corresponding SPI channel in
 * order for its registers to be accessible. If pinmux has not been
 * set up, access to the channel's registers will simply hang.
 *
 * TODO(dhendrix): Clarify or remove this comment (is clock setup
 * necessary first, or just pinmux, or both?)
 */
static struct tegra_spi_channel tegra_spi_channels[] = {
	[0] = {
		.regs = (struct tegra_spi_regs *)TEGRA_SPI1_BASE,
		.req_sel = APBDMA_SLAVE_SL2B1,
		.slave = { .bus = 1 },
	},
	[1] = {
		.regs = (struct tegra_spi_regs *)TEGRA_SPI2_BASE,
		.req_sel = APBDMA_SLAVE_SL2B2,
		.slave = { .bus = 2 },
	},
	[2] = {
		.regs = (struct tegra_spi_regs *)TEGRA_SPI3_BASE,
		.req_sel = APBDMA_SLAVE_SL2B3,
		.slave = { .bus = 3 },
	},
	[3] = {
		.regs = (struct tegra_spi_regs *)TEGRA_SPI4_BASE,
		.req_sel = APBDMA_SLAVE_SL2B4,
		.slave = { .bus = 4 },
	},
	[4] = {
		.regs = (struct tegra_spi_regs *)TEGRA_SPI5_BASE,
		.req_sel = APBDMA_SLAVE_SL2B5,
		.slave = { .bus = 5 },
	},
	[5] = {
		.regs = (struct tegra_spi_regs *)TEGRA_SPI6_BASE,
		.req_sel = APBDMA_SLAVE_SL2B6,
		.slave = { .bus = 6 },
	},
};

/* Direction of a single transfer set-up: host-to-device or device-to-host. */
enum spi_direction {
	SPI_SEND = 0,		/* data flows out through the Tx FIFO */
	SPI_RECEIVE = 1,	/* data flows in through the Rx FIFO */
};

/*
 * Find the channel whose slave.bus equals @bus and program its COMMAND1
 * defaults: chip-select under software control and driven high, packet
 * length of 8 bits, unpacked mode.
 *
 * Returns the channel, or NULL when no table entry matches @bus (in which
 * case no register is touched).
 */
struct tegra_spi_channel *tegra_spi_init(unsigned int bus)
{
	struct tegra_spi_channel *chan = NULL;
	int idx;

	for (idx = 0; idx < ARRAY_SIZE(tegra_spi_channels) && !chan; idx++) {
		if (tegra_spi_channels[idx].slave.bus == bus)
			chan = &tegra_spi_channels[idx];
	}

	if (chan == NULL)
		return NULL;

	/* chip-select is driven by software; start with it high */
	setbits_le32(&chan->regs->command1,
			SPI_CMD1_CS_SW_HW | SPI_CMD1_CS_SW_VAL);

	/* drop any previous bit length / packed setting ... */
	clrbits_le32(&chan->regs->command1,
			SPI_CMD1_BIT_LEN_MASK | SPI_CMD1_PACKED);
	/* ... and select 8-bit packets (the field holds length - 1) */
	setbits_le32(&chan->regs->command1, 7 << SPI_CMD1_BIT_LEN_SHIFT);

	return chan;
}

static struct tegra_spi_channel * const to_tegra_spi(int bus) {
	return &tegra_spi_channels[bus - 1];
}

/*
 * Return the SPI clock rate in Hz for @bus. The bus number is currently
 * ignored and a fixed value is reported.
 */
static unsigned int tegra_spi_speed(unsigned int bus)
{
	/* FIXME: implement this properly, for now use max value (50MHz) */
	const unsigned int max_hz = 50 * 1000 * 1000;

	(void)bus;
	return max_hz;
}

int spi_claim_bus(struct spi_slave *slave)
{
	struct tegra_spi_regs *regs = to_tegra_spi(slave->bus)->regs;
	u32 val;

	tegra_spi_init(slave->bus);

	val = read32(&regs->command1);

	/* select appropriate chip-select line */
	val &= ~(SPI_CMD1_CS_SEL_MASK << SPI_CMD1_CS_SEL_SHIFT);
	val |= (slave->cs << SPI_CMD1_CS_SEL_SHIFT);

	/* drive chip-select with the inverse of the "inactive" value */
	if (val & (SPI_CMD1_CS_POL_INACTIVE0 << slave->cs))
		val &= ~SPI_CMD1_CS_SW_VAL;
	else
		val |= SPI_CMD1_CS_SW_VAL;

	write32(&regs->command1, val);
	return 0;
}

/*
 * Return the slave's chip-select line to its inactive level: the software
 * CS value is made equal to the CS_POL_INACTIVE bit for this chip-select.
 */
void spi_release_bus(struct spi_slave *slave)
{
	struct tegra_spi_regs *regs = to_tegra_spi(slave->bus)->regs;
	u32 cmd1 = read32(&regs->command1);
	const u32 inactive_high = SPI_CMD1_CS_POL_INACTIVE0 << slave->cs;

	if (!(cmd1 & inactive_high))
		cmd1 &= ~SPI_CMD1_CS_SW_VAL;
	else
		cmd1 |= SPI_CMD1_CS_SW_VAL;

	write32(&regs->command1, cmd1);
}

/*
 * Print the raw FIFO status word, a line for each overflow/underrun flag
 * that is set, and the current contents of the FIFO/data registers.
 */
static void dump_fifo_status(struct tegra_spi_channel *spi)
{
	const u32 fifo_stat = read32(&spi->regs->fifo_status);

	printk(BIOS_INFO, "Raw FIFO status: 0x%08x\n", fifo_stat);

	if (fifo_stat & SPI_FIFO_STATUS_TX_FIFO_OVF) {
		printk(BIOS_INFO, "\tTx overflow detected\n");
	}
	if (fifo_stat & SPI_FIFO_STATUS_TX_FIFO_UNR) {
		printk(BIOS_INFO, "\tTx underrun detected\n");
	}
	if (fifo_stat & SPI_FIFO_STATUS_RX_FIFO_OVF) {
		printk(BIOS_INFO, "\tRx overflow detected\n");
	}
	if (fifo_stat & SPI_FIFO_STATUS_RX_FIFO_UNR) {
		printk(BIOS_INFO, "\tRx underrun detected\n");
	}

	printk(BIOS_INFO, "TX_FIFO: 0x%08x, TX_DATA: 0x%08x\n",
		read32(&spi->regs->tx_fifo),
		read32(&spi->regs->tx_data));
	printk(BIOS_INFO, "RX_FIFO: 0x%08x, RX_DATA: 0x%08x\n",
		read32(&spi->regs->rx_fifo),
		read32(&spi->regs->rx_data));
}

static void clear_fifo_status(struct tegra_spi_channel *spi)
{
	clrbits_le32(&spi->regs->fifo_status,
				SPI_FIFO_STATUS_ERR |
				SPI_FIFO_STATUS_TX_FIFO_OVF |
				SPI_FIFO_STATUS_TX_FIFO_UNR |
				SPI_FIFO_STATUS_RX_FIFO_OVF |
				SPI_FIFO_STATUS_RX_FIFO_UNR);
}

static void dump_spi_regs(struct tegra_spi_channel *spi)
{
	printk(BIOS_INFO, "SPI regs:\n"
			"\tdma_blk: 0x%08x\n"
			"\tcommand1: 0x%08x\n"
			"\tdma_ctl: 0x%08x\n"
			"\ttrans_status: 0x%08x\n",
			read32(&spi->regs->dma_blk),
			read32(&spi->regs->command1),
			read32(&spi->regs->dma_ctl),
			read32(&spi->regs->trans_status));
}

/*
 * Print the APB DMA channel registers of @dma at BIOS_INFO.
 *
 * A NULL @dma is ignored: the DMA error path dumps both the Tx and the Rx
 * channel even though a transfer may only have claimed one of them.
 */
static void dump_dma_regs(struct apb_dma_channel *dma)
{
	if (!dma)
		return;

	printk(BIOS_INFO, "DMA regs:\n"
			"\tahb_ptr: 0x%08x\n"
			"\tapb_ptr: 0x%08x\n"
			"\tahb_seq: 0x%08x\n"
			"\tapb_seq: 0x%08x\n"
			"\tcsr: 0x%08x\n"
			"\tcsre: 0x%08x\n"
			"\twcount: 0x%08x\n"
			"\tdma_byte_sta: 0x%08x\n"
			"\tword_transfer: 0x%08x\n",
			read32(&dma->regs->ahb_ptr),
			read32(&dma->regs->apb_ptr),
			read32(&dma->regs->ahb_seq),
			read32(&dma->regs->apb_seq),
			read32(&dma->regs->csr),
			read32(&dma->regs->csre),
			read32(&dma->regs->wcount),
			read32(&dma->regs->dma_byte_sta),
			read32(&dma->regs->word_transfer));
}

static inline unsigned int spi_byte_count(struct tegra_spi_channel *spi)
{
	/* FIXME: Make this take total packet size into account */
	return read32(&spi->regs->trans_status) &
		(SPI_STATUS_BLOCK_COUNT << SPI_STATUS_BLOCK_COUNT_SHIFT);
}

/*
 * Sleep for roughly the time the bus needs to move @bytes_remaining bytes,
 * derived from the SPI clock rate. Delays shorter than MIN_DELAY_US are
 * skipped entirely, on the assumption that they are not worth the overhead
 * of yielding.
 */
#define MIN_DELAY_US 250
static void spi_delay(struct tegra_spi_channel *spi,
				unsigned int bytes_remaining)
{
	unsigned int bytes_per_sec = tegra_spi_speed(spi->slave.bus) / 8;
	unsigned int ns_per_byte = 1000000000 / bytes_per_sec;
	unsigned int delay_us = (ns_per_byte * bytes_remaining) / 1000;

	if (delay_us >= MIN_DELAY_US)
		udelay(delay_us);
}

static void tegra_spi_wait(struct tegra_spi_channel *spi)
{
	unsigned int count, dma_blk;

	dma_blk = 1 + (read32(&spi->regs->dma_blk) &
		(SPI_DMA_CTL_BLOCK_SIZE_MASK << SPI_DMA_CTL_BLOCK_SIZE_SHIFT));

	while ((count = spi_byte_count(spi)) != dma_blk)
		spi_delay(spi, dma_blk - count);
}


static int fifo_error(struct tegra_spi_channel *spi)
{
	return read32(&spi->regs->fifo_status) & SPI_FIFO_STATUS_ERR ? 1 : 0;
}

/*
 * Set up a PIO (FIFO) transfer of up to @bytes bytes in direction @dir.
 *
 * The FIFO for that direction is flushed, the matching Tx/Rx enable bit is
 * set and BLOCK_SIZE is programmed. For SPI_SEND the data is taken from
 * spi->out_buf and queued in the Tx FIFO.
 *
 * Returns the number of bytes this transfer will move, which is capped at
 * SPI_MAX_TRANSFER_BYTES_FIFO and may therefore be less than @bytes.
 */
static int tegra_spi_pio_prepare(struct tegra_spi_channel *spi,
			unsigned int bytes, enum spi_direction dir)
{
	u8 *p = spi->out_buf;
	unsigned int todo = MIN(bytes, SPI_MAX_TRANSFER_BYTES_FIFO);
	u32 flush_mask, enable_mask;

	if (dir == SPI_SEND) {
		flush_mask = SPI_FIFO_STATUS_TX_FIFO_FLUSH;
		enable_mask = SPI_CMD1_TX_EN;
	} else {
		flush_mask = SPI_FIFO_STATUS_RX_FIFO_FLUSH;
		enable_mask = SPI_CMD1_RX_EN;
	}

	/* request a flush and wait for the bit to read back as zero */
	setbits_le32(&spi->regs->fifo_status, flush_mask);
	while (read32(&spi->regs->fifo_status) & flush_mask)
		;

	setbits_le32(&spi->regs->command1, enable_mask);

	/* BLOCK_SIZE in SPI_DMA_BLK register applies to both DMA and
	 * PIO transfers */
	write32(&spi->regs->dma_blk, todo - 1);

	if (dir == SPI_SEND) {
		/*
		 * Queue exactly the bytes covered by BLOCK_SIZE. Queuing
		 * "bytes" here would write past the FIFO capacity whenever
		 * the request was capped above.
		 */
		unsigned int to_fifo = todo;
		while (to_fifo) {
			write32(&spi->regs->tx_fifo, *p);
			p++;
			to_fifo--;
		}
	}

	return todo;
}

static void tegra_spi_pio_start(struct tegra_spi_channel *spi)
{
	setbits_le32(&spi->regs->trans_status, SPI_STATUS_RDY);
	setbits_le32(&spi->regs->command1, SPI_CMD1_GO);
	/* Make sure the write to command1 completes. */
	read32(&spi->regs->command1);
}

/* Return the RX_FIFO_FULL_COUNT field extracted from FIFO_STATUS. */
static inline u32 rx_fifo_count(struct tegra_spi_channel *spi)
{
	u32 fifo_stat = read32(&spi->regs->fifo_status);

	fifo_stat >>= SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_SHIFT;
	return fifo_stat & SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_MASK;
}

static int tegra_spi_pio_finish(struct tegra_spi_channel *spi)
{
	u8 *p = spi->in_buf;
	struct stopwatch sw;

	clrbits_le32(&spi->regs->command1, SPI_CMD1_RX_EN | SPI_CMD1_TX_EN);

	/*
	 * Allow some time in case the Rx FIFO does not yet have
	 * all packets pushed into it. See chrome-os-partner:24215.
	 */
	stopwatch_init_usecs_expire(&sw, SPI_FIFO_XFER_TIMEOUT_US);
	do {
		if (rx_fifo_count(spi) == spi_byte_count(spi))
			break;
	} while (!stopwatch_expired(&sw));

	while (!(read32(&spi->regs->fifo_status) &
				SPI_FIFO_STATUS_RX_FIFO_EMPTY)) {
		*p = read8(&spi->regs->rx_fifo);
		p++;
	}

	if (fifo_error(spi)) {
		printk(BIOS_ERR, "%s: ERROR:\n", __func__);
		dump_spi_regs(spi);
		dump_fifo_status(spi);
		return -1;
	}

	return 0;
}

/*
 * Program the direction-independent parts of an APB DMA channel for use
 * with @spi: APB/AHB sequencer settings and the CSR mode/request-select
 * fields. The caller sets the pointers, direction and word count.
 */
static void setup_dma_params(struct tegra_spi_channel *spi,
				struct apb_dma_channel *dma)
{
	/* APB bus width = 8-bits, address wrap for each word */
	/* (the bus width field is cleared to zero here) */
	clrbits_le32(&dma->regs->apb_seq,
			APB_BUS_WIDTH_MASK << APB_BUS_WIDTH_SHIFT);
	/* AHB 1 word burst, bus width = 32 bits (fixed in hardware),
	 * no address wrapping */
	/* NOTE(review): the literal 4 is presumably the field encoding for a
	 * 1-word burst -- confirm against the APB DMA register description. */
	clrsetbits_le32(&dma->regs->ahb_seq,
			(AHB_BURST_MASK << AHB_BURST_SHIFT),
			4 << AHB_BURST_SHIFT);

	/* Set ONCE mode to transfer one "block" at a time (64KB) and enable
	 * flow control. */
	/* The request-select field is cleared first so the channel's own
	 * req_sel can be OR-ed in below. */
	clrbits_le32(&dma->regs->csr,
			APB_CSR_REQ_SEL_MASK << APB_CSR_REQ_SEL_SHIFT);
	setbits_le32(&dma->regs->csr, APB_CSR_ONCE | APB_CSR_FLOW |
			(spi->req_sel << APB_CSR_REQ_SEL_SHIFT));
}

static int tegra_spi_dma_prepare(struct tegra_spi_channel *spi,
		unsigned int bytes, enum spi_direction dir)
{
	unsigned int todo, wcount;

	/*
	 * For DMA we need to think of things in terms of word count.
	 * AHB width is fixed at 32-bits. To avoid overrunning
	 * the in/out buffers we must align down. (Note: lowest 2-bits
	 * in WCOUNT register are ignored, and WCOUNT seems to count
	 * words starting at n-1)
	 *
	 * Example: If "bytes" is 7 and we are transferring 1-byte at a time,
	 * WCOUNT should be 4. The remaining 3 bytes must be transferred
	 * using PIO.
	 */
	todo = MIN(bytes, SPI_MAX_TRANSFER_BYTES_DMA - TEGRA_DMA_ALIGN_BYTES);
	todo = ALIGN_DOWN(todo, TEGRA_DMA_ALIGN_BYTES);
	wcount = ALIGN_DOWN(todo - TEGRA_DMA_ALIGN_BYTES, TEGRA_DMA_ALIGN_BYTES);

	if (dir == SPI_SEND) {
		spi->dma_out = dma_claim();
		if (!spi->dma_out)
			return -1;

		/* ensure bytes to send will be visible to DMA controller */
		dcache_clean_by_mva(spi->out_buf, bytes);

		write32(&spi->dma_out->regs->apb_ptr,
			(uintptr_t)&spi->regs->tx_fifo);
		write32(&spi->dma_out->regs->ahb_ptr, (uintptr_t)spi->out_buf);
		setbits_le32(&spi->dma_out->regs->csr, APB_CSR_DIR);
		setup_dma_params(spi, spi->dma_out);
		write32(&spi->dma_out->regs->wcount, wcount);
	} else {
		spi->dma_in = dma_claim();
		if (!spi->dma_in)
			return -1;

		/* avoid data collisions */
		dcache_clean_invalidate_by_mva(spi->in_buf, bytes);

		write32(&spi->dma_in->regs->apb_ptr,
			(uintptr_t)&spi->regs->rx_fifo);
		write32(&spi->dma_in->regs->ahb_ptr, (uintptr_t)spi->in_buf);
		clrbits_le32(&spi->dma_in->regs->csr, APB_CSR_DIR);
		setup_dma_params(spi, spi->dma_in);
		write32(&spi->dma_in->regs->wcount, wcount);
	}

	/* BLOCK_SIZE starts at n-1 */
	write32(&spi->regs->dma_blk, todo - 1);
	return todo;
}

/*
 * Start a previously prepared DMA transfer on whichever of spi->dma_out /
 * spi->dma_in has been claimed. The order of the register writes below is
 * deliberate; see the comments on each step.
 */
static void tegra_spi_dma_start(struct tegra_spi_channel *spi)
{
	/*
	 * The RDY bit in SPI_TRANS_STATUS needs to be cleared manually
	 * (set bit to clear) between each transaction. Otherwise the next
	 * transaction does not start.
	 */
	setbits_le32(&spi->regs->trans_status, SPI_STATUS_RDY);

	/*
	 * The DMA triggers have units of packets. As each packet is currently
	 * 1 byte the triggers need to be set to 4 packets (0b01) to match
	 * the AHB 32-bit (4 byte) transfer. Otherwise FIFO errors can
	 * occur.
	 */
	if (spi->dma_out) {
		clrsetbits_le32(&spi->regs->dma_ctl,
			SPI_DMA_CTL_TX_TRIG_MASK << SPI_DMA_CTL_TX_TRIG_SHIFT,
			1 << SPI_DMA_CTL_TX_TRIG_SHIFT);
		setbits_le32(&spi->regs->command1, SPI_CMD1_TX_EN);
	}
	if (spi->dma_in) {
		clrsetbits_le32(&spi->regs->dma_ctl,
			SPI_DMA_CTL_RX_TRIG_MASK << SPI_DMA_CTL_RX_TRIG_SHIFT,
			1 << SPI_DMA_CTL_RX_TRIG_SHIFT);
		setbits_le32(&spi->regs->command1, SPI_CMD1_RX_EN);
	}

	/*
	 * To avoid underrun conditions, enable APB DMA before SPI DMA for
	 * Tx and enable SPI DMA before APB DMA for Rx.
	 */
	if (spi->dma_out)
		dma_start(spi->dma_out);
	setbits_le32(&spi->regs->dma_ctl, SPI_DMA_CTL_DMA);
	if (spi->dma_in)
		dma_start(spi->dma_in);


}

/*
 * Wait for the claimed DMA channel(s) to finish, stop and release them,
 * and disable the corresponding SPI direction.
 *
 * Either of spi->dma_in / spi->dma_out may be NULL (a transfer may use only
 * one direction), so every access to a channel is guarded, including the
 * read of its WCOUNT and the register dump on error.
 *
 * Returns 0 on success, -1 if the FIFO error bit is set. Both channel
 * pointers are reset to NULL before returning.
 */
static int tegra_spi_dma_finish(struct tegra_spi_channel *spi)
{
	int ret;
	unsigned int todo;

	if (spi->dma_in) {
		todo = read32(&spi->dma_in->regs->wcount);

		while ((read32(&spi->dma_in->regs->dma_byte_sta) < todo) ||
				dma_busy(spi->dma_in))
			;	/* this shouldn't take long, no udelay */
		dma_stop(spi->dma_in);
		clrbits_le32(&spi->regs->command1, SPI_CMD1_RX_EN);
		dma_release(spi->dma_in);
	}

	if (spi->dma_out) {
		todo = read32(&spi->dma_out->regs->wcount);

		while ((read32(&spi->dma_out->regs->dma_byte_sta) < todo) ||
				dma_busy(spi->dma_out))
			spi_delay(spi, todo - spi_byte_count(spi));
		clrbits_le32(&spi->regs->command1, SPI_CMD1_TX_EN);
		dma_stop(spi->dma_out);
		dma_release(spi->dma_out);
	}

	if (fifo_error(spi)) {
		printk(BIOS_ERR, "%s: ERROR:\n", __func__);
		/* only dump channels that were actually in use */
		if (spi->dma_out)
			dump_dma_regs(spi->dma_out);
		if (spi->dma_in)
			dump_dma_regs(spi->dma_in);
		dump_spi_regs(spi);
		dump_fifo_status(spi);
		ret = -1;
		goto done;
	}

	ret = 0;
done:
	spi->dma_in = NULL;
	spi->dma_out = NULL;
	return ret;
}

/*
 * xfer_setup() prepares a transfer. It does sanity checking, alignment, and
 * sets transfer mode used by this channel (if not set already).
 *
 * A few caveats to watch out for:
 * - The number of bytes which can be transferred may be smaller than the
 *   number of bytes the caller specifies. The number of bytes ready for
 *   a transfer will be returned (unless an error occurs).
 *
 * - Only one mode can be used for both RX and TX. The transfer mode of the
 *   SPI channel (spi->xfer_mode) is checked each time this function is called.
 *   If conflicting modes are detected, spi->xfer_mode will be set to
 *   XFER_MODE_NONE and an error will be returned.
 *
 * Returns bytes ready for transfer if successful, <0 to indicate error.
 */
static int xfer_setup(struct tegra_spi_channel *spi, void *buf,
		unsigned int bytes, enum spi_direction dir)
{
	unsigned int line_size = dcache_line_bytes();
	unsigned int align;
	int ret = -1;

	if (!bytes)
		return 0;

	if (dir == SPI_SEND)
		spi->out_buf = buf;
	else if (dir == SPI_RECEIVE)
		spi->in_buf = buf;

	/*
	 * Alignment consideratons:
	 * When we enable caching we'll need to clean/invalidate portions of
	 * memory. So we need to be careful about memory alignment. Also, DMA
	 * likes to operate on 4-bytes at a time on the AHB side. So for
	 * example, if we only want to receive 1 byte, 4 bytes will be be
	 * written in memory even if those extra 3 bytes are beyond the length
	 * we want.
	 *
	 * For now we'll use PIO to send/receive unaligned bytes. We may
	 * consider setting aside some space for a kind of bounce buffer to
	 * stay in DMA mode once we have a chance to benchmark the two
	 * approaches.
	 */

	if (bytes < line_size) {
		if (spi->xfer_mode == XFER_MODE_DMA) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_PIO;
			ret = tegra_spi_pio_prepare(spi, bytes, dir);
		}
		goto done;
	}

	/* transfer bytes before the aligned boundary */
	align = line_size - ((uintptr_t)buf % line_size);
	if ((align != 0) && (align != line_size)) {
		if (spi->xfer_mode == XFER_MODE_DMA) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_PIO;
			ret = tegra_spi_pio_prepare(spi, align, dir);
		}
		goto done;
	}

	/* do aligned DMA transfer */
	align = (((uintptr_t)buf + bytes) % line_size);
	if (bytes - align > 0) {
		unsigned int dma_bytes = bytes - align;

		if (spi->xfer_mode == XFER_MODE_PIO) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_DMA;
			ret = tegra_spi_dma_prepare(spi, dma_bytes, dir);
		}

		goto done;
	}

	/* transfer any remaining unaligned bytes */
	if (align) {
		if (spi->xfer_mode == XFER_MODE_DMA) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_PIO;
			ret = tegra_spi_pio_prepare(spi, align, dir);
		}
		goto done;
	}

done:
	return ret;
}

/* Start the prepared transfer using DMA if that mode is set, else PIO. */
static void xfer_start(struct tegra_spi_channel *spi)
{
	if (spi->xfer_mode != XFER_MODE_DMA) {
		tegra_spi_pio_start(spi);
		return;
	}

	tegra_spi_dma_start(spi);
}

/*
 * Wait for the transfer in flight to complete. The same wait routine is
 * used regardless of the transfer mode.
 */
static void xfer_wait(struct tegra_spi_channel *spi)
{
	tegra_spi_wait(spi);
}

/*
 * Run the mode-specific completion handler and reset the channel's transfer
 * mode to XFER_MODE_NONE. Returns the handler's result.
 */
static int xfer_finish(struct tegra_spi_channel *spi)
{
	const int was_dma = (spi->xfer_mode == XFER_MODE_DMA);
	int status = was_dma ? tegra_spi_dma_finish(spi)
			     : tegra_spi_pio_finish(spi);

	spi->xfer_mode = XFER_MODE_NONE;
	return status;
}

/*
 * Report how many data bytes may accompany a command of @cmd_len bytes.
 * This implementation imposes no limit and returns @buf_len unchanged.
 */
unsigned int spi_crop_chunk(unsigned int cmd_len, unsigned int buf_len)
{
	unsigned int chunk = buf_len;

	(void)cmd_len;
	return chunk;
}

/*
 * Transfer @out_bytes from @dout and/or @in_bytes into @din on @slave's bus.
 *
 * The request is carried out as a sequence of smaller transfers: each pass
 * sets up as much as xfer_setup() allows, runs it, and advances the buffers
 * by the amount that was set up, until both counts reach zero.
 *
 * Returns 0 on success, -1 on error (after logging and clearing the FIFO
 * status).
 */
int spi_xfer(struct spi_slave *slave, const void *dout,
		unsigned int out_bytes, void *din, unsigned int in_bytes)
{
	/* Looked up before the range check below; only dereferenced later. */
	struct tegra_spi_channel *spi = to_tegra_spi(slave->bus);
	u8 *out_buf = (u8 *)dout;
	u8 *in_buf = (u8 *)din;
	unsigned int todo;
	int ret = 0;

	/* tegra bus numbers start at 1 */
	ASSERT(slave->bus >= 1 && slave->bus <= ARRAY_SIZE(tegra_spi_channels));

	while (out_bytes || in_bytes) {
		/* bytes prepared by the most recent xfer_setup() call */
		int x = 0;

		/* With both directions active, limit the pass to the shorter. */
		if (out_bytes == 0)
			todo = in_bytes;
		else if (in_bytes == 0)
			todo = out_bytes;
		else
			todo = MIN(out_bytes, in_bytes);

		if (out_bytes) {
			x = xfer_setup(spi, out_buf, todo, SPI_SEND);
			if (x < 0) {
				/*
				 * XFER_MODE_NONE after a failure means a
				 * mode conflict was detected: force PIO
				 * and retry the pass. Any other mode is a
				 * hard error.
				 */
				if (spi->xfer_mode == XFER_MODE_NONE) {
					spi->xfer_mode = XFER_MODE_PIO;
					continue;
				} else {
					ret = -1;
					break;
				}
			}
		}
		if (in_bytes) {
			/*
			 * NOTE(review): this overwrites the count returned
			 * for the send side; when both directions are
			 * active and the two set-ups return different
			 * counts, both buffers advance by the receive
			 * count -- confirm callers never hit that case.
			 */
			x = xfer_setup(spi, in_buf, todo, SPI_RECEIVE);
			if (x < 0) {
				/*
				 * NOTE(review): if the send side keeps
				 * choosing PIO and the receive side keeps
				 * needing DMA, this retry looks like it can
				 * repeat indefinitely -- verify.
				 */
				if (spi->xfer_mode == XFER_MODE_NONE) {
					spi->xfer_mode = XFER_MODE_PIO;
					continue;
				} else {
					ret = -1;
					break;
				}
			}
		}

		/*
		 * Note: Some devices (such as Chrome EC) are sensitive to
		 * delays, so be careful when adding debug prints not to
		 * cause timeouts between transfers.
		 */
		xfer_start(spi);
		xfer_wait(spi);
		if (xfer_finish(spi)) {
			ret = -1;
			break;
		}

		/* Post-processing. */
		/* Advance each active direction by the bytes just moved. */
		if (out_bytes) {
			out_bytes -= x;
			out_buf += x;
		}
		if (in_bytes) {
			in_bytes -= x;
			in_buf += x;
		}
	}

	if (ret < 0) {
		/* ret is only set inside the loop, after todo was assigned */
		printk(BIOS_ERR, "%s: Error detected\n", __func__);
		printk(BIOS_ERR, "Transaction size: %u, bytes remaining: "
				"%u out / %u in\n", todo, out_bytes, in_bytes);
		clear_fifo_status(spi);
	}
	return ret;
}

/*
 * SPI as CBFS media.
 *
 * Context stored in cbfs_media->context by
 * initialize_tegra_spi_cbfs_media() and used by the CBFS callbacks below.
 */
struct tegra_spi_media {
	/* slave (bus + chip-select) the boot flash is read through */
	struct spi_slave *slave;
	/* buffer handed to the cbfs_simple_buffer map/unmap helpers */
	struct cbfs_simple_buffer buffer;
};

/* CBFS media "open" callback: nothing to do besides tracing; returns 0. */
static int tegra_spi_cbfs_open(struct cbfs_media *media)
{
	(void)media;

	DEBUG_SPI("tegra_spi_cbfs_open\n");

	return 0;
}

/* CBFS media "close" callback: nothing to do besides tracing; returns 0. */
static int tegra_spi_cbfs_close(struct cbfs_media *media)
{
	(void)media;

	DEBUG_SPI("tegra_spi_cbfs_close\n");

	return 0;
}

/* Flash read opcodes and the length of the command phase for each. */
#define JEDEC_READ			0x03
#define JEDEC_READ_OUTSIZE		0x04
#define JEDEC_FAST_READ_DUAL		0x3b
#define JEDEC_FAST_READ_DUAL_OUTSIZE	0x05

/*
 * CBFS media "read" callback: read @count bytes starting at flash @offset
 * into @dest.
 *
 * A read command (opcode, 24-bit address and, in dual mode, one dummy byte)
 * is sent first, then the data phase is received. Only bits 23:0 of
 * @offset are placed in the command.
 *
 * Returns @count on success and 0 on any failure.
 */
static size_t tegra_spi_cbfs_read(struct cbfs_media *media, void *dest,
				   size_t offset, size_t count)
{
	struct tegra_spi_media *spi = (struct tegra_spi_media *)media->context;
	u8 spi_read_cmd[JEDEC_FAST_READ_DUAL_OUTSIZE];
	unsigned int read_cmd_bytes;
	/* size_t, not int: a large count must not be truncated or turn
	 * negative and be mistaken for an error. */
	size_t ret = count;
	struct tegra_spi_channel *channel;

	channel = to_tegra_spi(spi->slave->bus);

	if (channel->dual_mode) {
		/*
		 * Command 0x3b will interleave data only, command 0xbb will
		 * interleave the address as well. It's nice to see the address
		 * plainly when debugging, and we're mostly concerned with
		 * large transfers so the optimization of using 0xbb isn't
		 * really worthwhile.
		 */
		spi_read_cmd[0] = JEDEC_FAST_READ_DUAL;
		spi_read_cmd[4] = 0x00;	/* dummy byte */
		read_cmd_bytes = JEDEC_FAST_READ_DUAL_OUTSIZE;
	} else {
		spi_read_cmd[0] = JEDEC_READ;
		read_cmd_bytes = JEDEC_READ_OUTSIZE;
	}
	spi_read_cmd[1] = (offset >> 16) & 0xff;
	spi_read_cmd[2] = (offset >> 8) & 0xff;
	spi_read_cmd[3] = offset & 0xff;

	/* nothing was claimed on failure, so there is nothing to release */
	if (spi_claim_bus(spi->slave))
		return 0;

	if (spi_xfer(spi->slave, spi_read_cmd,
			read_cmd_bytes, NULL, 0) < 0) {
		ret = 0;
		/* report the bytes actually sent, not the buffer size */
		printk(BIOS_ERR, "%s: Failed to transfer %u bytes\n",
				__func__, read_cmd_bytes);
		goto tegra_spi_cbfs_read_exit;
	}

	/* dual-bit mode applies to the data phase only */
	if (channel->dual_mode) {
		setbits_le32(&channel->regs->command1, SPI_CMD1_BOTH_EN_BIT);
	}
	if (spi_xfer(spi->slave, NULL, 0, dest, count) < 0) {
		ret = 0;
		printk(BIOS_ERR, "%s: Failed to transfer %zu bytes\n",
				__func__, count);
	}
	if (channel->dual_mode)
		clrbits_le32(&channel->regs->command1, SPI_CMD1_BOTH_EN_BIT);

tegra_spi_cbfs_read_exit:
	/* de-assert /CS */
	spi_release_bus(spi->slave);
	return ret;
}

/*
 * CBFS media "map" callback: delegate to cbfs_simple_buffer_map() using the
 * buffer held in this media's context and return its result.
 */
static void *tegra_spi_cbfs_map(struct cbfs_media *media, size_t offset,
				 size_t count)
{
	struct tegra_spi_media *ctx = (struct tegra_spi_media*)media->context;

	DEBUG_SPI("tegra_spi_cbfs_map\n");

	return cbfs_simple_buffer_map(&ctx->buffer, media, offset, count);
}

/*
 * CBFS media "unmap" callback: delegate to cbfs_simple_buffer_unmap() using
 * the buffer held in this media's context and return its result.
 */
static void *tegra_spi_cbfs_unmap(struct cbfs_media *media,
				   const void *address)
{
	struct tegra_spi_media *ctx = (struct tegra_spi_media*)media->context;
	void *result;

	DEBUG_SPI("tegra_spi_cbfs_unmap\n");

	result = cbfs_simple_buffer_unmap(&ctx->buffer, address);
	return result;
}

/*
 * Fill in @media so that CBFS reads go through the boot SPI bus selected by
 * CONFIG_BOOT_MEDIA_SPI_BUS / CONFIG_BOOT_MEDIA_SPI_CHIP_SELECT, using
 * @buffer_address / @buffer_size as the mapping buffer.
 *
 * The context is a single static object, so every call shares (and resets)
 * the same state. Always returns 0.
 */
int initialize_tegra_spi_cbfs_media(struct cbfs_media *media,
				     void *buffer_address,
				     size_t buffer_size)
{
	// TODO Replace static variable to support multiple streams.
	static struct tegra_spi_media context;
	static struct tegra_spi_channel *channel;

	/* the channel table is indexed by bus number minus one */
	channel = &tegra_spi_channels[CONFIG_BOOT_MEDIA_SPI_BUS - 1];
	channel->slave.cs = CONFIG_BOOT_MEDIA_SPI_CHIP_SELECT;

	DEBUG_SPI("Initializing CBFS media on SPI\n");

	/* reset the shared context */
	context.slave = &channel->slave;
	context.buffer.last_allocate = 0;
	context.buffer.allocated = 0;
	context.buffer.buffer = buffer_address;
	context.buffer.size = buffer_size;

	/* hook up the media callbacks */
	media->context = (void*)&context;
	media->open = tegra_spi_cbfs_open;
	media->close = tegra_spi_cbfs_close;
	media->read = tegra_spi_cbfs_read;
	media->map = tegra_spi_cbfs_map;
	media->unmap = tegra_spi_cbfs_unmap;

#if CONFIG_SPI_FLASH_FAST_READ_DUAL_OUTPUT_3B == 1
	channel->dual_mode = 1;
#endif

	return 0;
}

/*
 * Return the spi_slave embedded in the channel for @bus, or NULL if the
 * channel lookup yields NULL. @cs is not used here; the returned slave
 * keeps whatever chip-select value it already holds.
 */
struct spi_slave *spi_setup_slave(unsigned int bus, unsigned int cs)
{
	struct tegra_spi_channel *chan = to_tegra_spi(bus);

	(void)cs;
	return chan ? &chan->slave : NULL;
}