diff -rupN dosbox-PREV//src/hardware/pci_bus.cpp dosbox-RECENT//src/hardware/pci_bus.cpp
--- dosbox-PREV//src/hardware/pci_bus.cpp	2016-01-11 01:47:36 -0500
+++ dosbox-RECENT//src/hardware/pci_bus.cpp	2016-02-29 04:33:47 -0500
@@ -117,7 +117,7 @@ PCI_Device::PCI_Device(Bit16u vendor, Bi
 class PCI_VGADevice:public PCI_Device {
 private:
 	static const Bit16u vendor=0x5333;		// S3
-	static const Bit16u device=0x8811;		// trio64
+	static const Bit16u device=0x5631;		// Virge
 //	static const Bit16u device=0x8810;		// trio32
 public:
 	PCI_VGADevice():PCI_Device(vendor,device) {
diff -rupN dosbox-PREV//src/hardware/vga_memory.cpp dosbox-RECENT//src/hardware/vga_memory.cpp
--- dosbox-PREV//src/hardware/vga_memory.cpp	2016-01-11 20:24:47 -0500
+++ dosbox-RECENT//src/hardware/vga_memory.cpp	2016-03-01 22:45:49 -0500
@@ -1062,7 +1062,7 @@ void VGA_SetupMemory(Section* sec) {
 	vga.svga.bank_read = vga.svga.bank_write = 0;
 	vga.svga.bank_read_full = vga.svga.bank_write_full = 0;
 
-	Bit32u vga_allocsize=vga.vmemsize;
+	Bit32u vga_allocsize= 0x8000000; // fixed 128 MiB (was vga.vmemsize); allows writing beyond fb display. NOTE(review): this makes the 512k lower-limit check below dead code - confirm the size is intended
 	// Keep lower limit at 512k
 	if (vga_allocsize<512*1024) vga_allocsize=512*1024;
 
diff -rupN dosbox-PREV//src/hardware/vga_s3.cpp dosbox-RECENT//src/hardware/vga_s3.cpp
--- dosbox-PREV//src/hardware/vga_s3.cpp	2016-01-11 02:29:10 -0500
+++ dosbox-RECENT//src/hardware/vga_s3.cpp	2016-02-29 02:31:04 -0500
@@ -371,9 +371,9 @@ Bitu SVGA_S3_ReadCRTC( Bitu reg, Bitu io
 	case 0x26:
 		return ((vga.attr.disabled & 1)?0x00:0x20) | (vga.attr.index & 0x1f);
 	case 0x2d:	/* Extended Chip ID (high byte of PCI device ID) */
-		return 0x88;
+		return 0x56;	// Virge
 	case 0x2e:	/* New Chip ID  (low byte of PCI device ID) */
-		return 0x11;	// Trio64	
+		return 0x31;	// Virge	
 	case 0x2f:	/* Revision */
 		return 0x00;	// Trio64 (exact value?)
 //		return 0x44;	// Trio64 V+
@@ -580,7 +580,7 @@ void SVGA_Setup_S3Trio(void) {
 	}
 
 	// S3 ROM signature
-	phys_writes(PhysMake(0xc000,0)+0x003f, "S3 86C764", 10);
+	phys_writes(PhysMake(0xc000,0)+0x003f, "S3 86C325", 10);	// Virge
 
 	PCI_AddSVGAS3_Device();
 }
diff -rupN dosbox-PREV//src/hardware/vga_xga.cpp dosbox-RECENT//src/hardware/vga_xga.cpp
--- dosbox-PREV//src/hardware/vga_xga.cpp	2016-03-01 23:21:08 -0500
+++ dosbox-RECENT//src/hardware/vga_xga.cpp	2016-03-01 23:20:51 -0500
@@ -72,6 +72,182 @@ struct XGAStatus {
 
 } xga;
 
+struct s3d {	// 2D engine register state; fields are filled in by the MMIO cases of XGA_Write
+	uint32_t src_base;	// MMIO 0xa4d4/0xa8d4, masked with 0x3ffff8
+	uint32_t dest_base;	// MMIO 0xa4d8/0xa8d8, masked with 0x3ffff8
+	int clip_l, clip_r, clip_t, clip_b;	// MMIO 0xa4dc/0xa8dc (left:right) and 0xa4e0/0xa8e0 (top:bottom), 11 bits each
+	int dest_str, src_str;	// MMIO 0xa4e4/0xa8e4 (dest in high word, src in low word, masked 0xff8); src_str is also written by 0xb4e4
+	uint32_t mono_pat_0;	// MMIO 0xa4e8/0xace8; expanded into pattern rows 0-3 when CMD_SET_MP is set
+	uint32_t mono_pat_1;	// MMIO 0xa4ec/0xacec; expanded into pattern rows 4-7 when CMD_SET_MP is set
+	uint32_t pat_bg_clr;	// MMIO 0xa4f0/0xacf0; colour used for clear mono-pattern bits
+	uint32_t pat_fg_clr;	// MMIO 0xa4f4/0xa8f4/0xacf4; colour used for set mono-pattern bits
+	uint32_t src_bg_clr;	// MMIO 0xa4f8
+	uint32_t src_fg_clr;	// MMIO 0xa4fc; with CMD_SET_TP, source pixels equal to this are not drawn
+	uint32_t cmd_set;	// MMIO 0xa500/0xa900/0xad00; see the CMD_SET_* constants
+	int r_width, r_height;	// MMIO 0xa504 (width in high word, height in low word)
+	int rsrc_x, rsrc_y;	// MMIO 0xa508
+	int rdest_x, rdest_y;	// MMIO 0xa50c; this write starts the command when CMD_SET_AE is set
+	// line parameters
+	int lxend0, lxend1;	// MMIO 0xa96c
+	int32_t ldx;	// MMIO 0xa970
+	uint32_t lxstart, lystart;	// MMIO 0xa974 (unmasked) and 0xa978 (11 bits)
+	int lycnt;	// MMIO 0xa97c, low 11 bits
+	int line_dir;	// MMIO 0xa97c, bit 31
+	// working copies latched by s3_virge_bitblt when a BITBLT starts (count == -1)
+	int src_x, src_y;	// from rsrc_x / rsrc_y
+	int dest_x, dest_y;	// from rdest_x / rdest_y
+	int w, h;	// from r_width / r_height
+	uint8_t rop;	// (cmd_set >> 17) & 0xff
+	// CPU-supplied source bits not yet consumed, kept between s3_virge_bitblt calls
+	int data_left_count;
+	uint32_t data_left;
+	// 8x8 colour patterns, written through MMIO 0xa000-0xa1fc
+	uint32_t pattern_8[8*8];	// four entries stored per 32-bit write
+	uint32_t pattern_16[8*8];	// two entries stored per 32-bit write
+	// polygon parameters
+	uint32_t prdx;	// MMIO 0xad68
+	uint32_t prxstart;	// MMIO 0xad6c
+	uint32_t pldx;	// MMIO 0xad70
+	uint32_t plxstart;	// MMIO 0xad74
+	uint32_t pystart;	// MMIO 0xad78, 11 bits
+	uint32_t pycnt;	// MMIO 0xad7c, masked with 0x300007ff; this write starts the command when CMD_SET_AE is set
+	uint32_t dest_l, dest_r;
+} s3d;
+
+struct s3d_tri	/* 3D triangle register state; fields are filled in by the 0xb4xx/0xb5xx MMIO cases of XGA_Write */
+{
+        uint32_t cmd_set;	/* MMIO 0xb500; starts a triangle immediately when CMD_SET_AE is clear */
+        int clip_l, clip_r, clip_t, clip_b;	/* MMIO 0xb4dc (left:right) and 0xb4e0 (top:bottom), 11 bits each */
+        /* destination surface */
+        uint32_t dest_base;	/* MMIO 0xb4d8, masked with 0x3ffff8 */
+        uint32_t dest_str;	/* MMIO 0xb4e4, high word masked with 0xff8 */
+        /* Z buffer */
+        uint32_t z_base;	/* MMIO 0xb4d4, masked with 0x3ffff8 */
+        uint32_t z_str;	/* MMIO 0xb4e8, masked with 0xff8 */
+	/* texture */
+        uint32_t tex_base;	/* MMIO 0xb4ec, masked with 0x3ffff8 */
+        uint32_t tex_bdr_clr;	/* MMIO 0xb4f0, 24 bits */
+        uint32_t tbv, tbu;	/* MMIO 0xb504 / 0xb508, 20 bits each */
+        int32_t TdVdX, TdUdX;	/* MMIO 0xb51c / 0xb520 */
+        int32_t TdVdY, TdUdY;	/* MMIO 0xb528 / 0xb52c */
+        uint32_t tus, tvs;	/* MMIO 0xb538 / 0xb534 */
+	/* Z */
+        int32_t TdZdX, TdZdY;	/* MMIO 0xb554 / 0xb558 */
+        uint32_t tzs;	/* MMIO 0xb55c */
+	/* W */
+        int32_t TdWdX, TdWdY;	/* MMIO 0xb50c / 0xb510 */
+        uint32_t tws;	/* MMIO 0xb514 */
+        /* D */
+        int32_t TdDdX, TdDdY;	/* MMIO 0xb518 / 0xb524 */
+        uint32_t tds;	/* MMIO 0xb530 */
+        /* colour: each 32-bit write carries two 16-bit values (high word : low word) */
+        int16_t TdGdX, TdBdX, TdRdX, TdAdX;	/* MMIO 0xb53c (G:B) and 0xb540 (A:R) */
+        int16_t TdGdY, TdBdY, TdRdY, TdAdY;	/* MMIO 0xb544 (G:B) and 0xb548 (A:R) */
+        uint32_t tgs, tbs, trs, tas;	/* MMIO 0xb54c (G:B) and 0xb550 (A:R) */
+        /* edge parameters */
+        uint32_t TdXdY12;	/* MMIO 0xb560 */
+        uint32_t txend12;	/* MMIO 0xb564 */
+        uint32_t TdXdY01;	/* MMIO 0xb568 */
+        uint32_t txend01;	/* MMIO 0xb56c */
+        uint32_t TdXdY02;	/* MMIO 0xb570 */
+        uint32_t txs;	/* MMIO 0xb574 */
+        uint32_t tys;	/* MMIO 0xb578 */
+        int ty01, ty12, tlr;	/* MMIO 0xb57c: bits 16-26, bits 0-10, bit 31; this write starts the triangle when CMD_SET_AE is set */
+} s3d_tri;
+
+typedef struct rgba_t
+{
+        int r, g, b, a;
+} rgba_t;
+
+typedef struct s3d_texture_state_t
+{
+        int level;
+        int texture_shift;
+        
+        int32_t u, v;
+} s3d_texture_state_t;
+
+typedef struct s3d_state_t
+{
+        int32_t r, g, b, a, u, v, d, w;
+
+        int32_t base_r, base_g, base_b, base_a, base_u, base_v, base_d, base_w;
+        
+        uint32_t base_z;
+
+        uint32_t tbu, tbv;
+
+        uint32_t cmd_set;
+        int max_d;
+        
+        uint16_t *texture[10];
+        
+        uint32_t tex_bdr_clr;
+        
+        int32_t x1, x2;
+        int y;
+        
+        rgba_t dest_rgba;
+} s3d_state_t;
+
+static int s3d_busy, pixel_count;
+static bool dithering_enabled=true, is_375=false, bilinear_enabled=true;
+
+static int dither[4][4] =
+{
+        0,  4,  1,  5,
+        6,  2,  7,  3,
+        1,  5,  0,  4,
+        7,  3,  6,  2,
+};
+
+enum	/* bit fields of the cmd_set registers (s3d.cmd_set / s3d_tri.cmd_set) */
+{
+        CMD_SET_AE = 1,	/* set: command starts on the last parameter register write; clear: starts on the cmd_set write */
+        CMD_SET_HC = (1 << 1),	/* enables the clip-rectangle test in CLIP / CLIP_3D */
+        /* pixel format; s3_virge_bitblt handles only the 8 and 16 bpp values */
+        CMD_SET_FORMAT_MASK = (7 << 2),
+        CMD_SET_FORMAT_8 = (0 << 2),
+        CMD_SET_FORMAT_16 = (1 << 2),
+        /* source / pattern selection */
+        CMD_SET_MS = (1 << 6),	/* when set, XGA_Write byte-swaps the 32-bit CPU data before handing it to s3_virge_bitblt */
+        CMD_SET_IDS = (1 << 7),	/* BITBLT source comes from CPU data: s3_virge_bitblt returns after setup and consumes cpu_dat on later calls */
+        CMD_SET_MP = (1 << 8),	/* pattern is expanded from mono_pat_0/1 using pat_fg_clr / pat_bg_clr */
+        CMD_SET_TP = (1 << 9),	/* source pixels equal to src_fg_clr are not drawn */
+        /* selects count_mask in s3_virge_bitblt: byte ~0x7, word ~0xf, dword (and any other value) ~0x1f */
+        CMD_SET_ITA_MASK = (3 << 10),
+        CMD_SET_ITA_BYTE = (0 << 10),
+        CMD_SET_ITA_WORD = (1 << 10),
+        CMD_SET_ITA_DWORD = (2 << 10),
+        
+        CMD_SET_ZUP = (1 << 23),
+        /* Z_CLIP and Z_WRITE act only when both of these bits are clear */
+        CMD_SET_ZB_MODE = (3 << 24),	/* NOTE: covers bits 24-25, so it overlaps CMD_SET_XP below */
+	/* blit direction: s3_virge_bitblt steps +1 when the bit is set, -1 when clear */
+        CMD_SET_XP = (1 << 25),
+        CMD_SET_YP = (1 << 26),
+        /* command code, compared against the CMD_SET_COMMAND_* values */
+        CMD_SET_COMMAND_MASK = (15 << 27)
+};
+
+#define CMD_SET_ABC_SRC    (1 << 18)	/* NOTE(review): bits 18-19 lie inside the (cmd_set >> 17) & 0xff rop field of the 2D engine; presumably meant for s3d_tri.cmd_set only - confirm */
+#define CMD_SET_ABC_ENABLE (1 << 19)
+#define CMD_SET_TWE        (1 << 26)	/* same bit value as CMD_SET_YP */
+
+enum	/* command codes: compare against (cmd_set & CMD_SET_COMMAND_MASK), as s3_virge_bitblt does */
+{
+        CMD_SET_COMMAND_BITBLT = (0 << 27),
+        CMD_SET_COMMAND_RECTFILL = (2 << 27),
+        CMD_SET_COMMAND_LINE = (3 << 27),
+        CMD_SET_COMMAND_POLY = (5 << 27),
+        CMD_SET_COMMAND_NOP = (15 << 27)	/* s3_virge_bitblt does nothing for this code */
+};
+
+static void s3_virge_bitblt(int count, uint32_t cpu_dat);
+static void s3_virge_triangle();
+static void tri(s3d_state_t *state, int yc, int32_t dx1, int32_t dx2);
+
 void XGA_Write_Multifunc(Bitu val, Bitu len) {
 	Bitu regselect = val >> 12;
 	Bitu dataval = val & 0xfff;
@@ -1035,9 +1211,17 @@ extern void vga_write_p3d5(Bitu port,Bit
 extern Bitu vga_read_p3d5(Bitu port,Bitu iolen);
 
 void XGA_Write(Bitu port, Bitu val, Bitu len) {
-//	LOG_MSG("XGA: Write to port %x, val %8x, len %x", port,val, len);
+	// LOG_MSG("XGA: Write to port %x, val %8x, len %x", port,val, len);
 
-	switch(port) {
+        if ((port & 0xffff) < 0x8000)
+        {
+                if (s3d.cmd_set & CMD_SET_MS)
+                        s3_virge_bitblt(32, ((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24));
+                else
+                        s3_virge_bitblt(32, val);
+        }else{
+
+	switch(port & 0xffff) {
 		case 0x8100:// drawing control: row (low word), column (high word)
 					// "CUR_X" and "CUR_Y" (see PORT 82E8h,PORT 86E8h)
 			xga.cury = val & 0x0fff;
@@ -1166,10 +1350,10 @@ void XGA_Write(Bitu port, Bitu val, Bitu
 			XGA_DrawWait(val, len);
 			break;
 		case 0x83d4:
-			if(len==1) vga_write_p3d4(0,val,1);
+			if(len==1) vga_write_p3d4(port & 0x3ff, val, 1); // vga_write_p3d4(0,val,1);	// this change untested
 			else if(len==2) {
-				vga_write_p3d4(0,val&0xff,1);
-				vga_write_p3d5(0,val>>8,1);
+				vga_write_p3d4(port & 0x3ff, val & 0xff, 1);	// low byte -> CRTC index, same port masking as the len==1 path (was vga_write_p3d4(0,val&0xff,1)); this change untested
+				vga_write_p3d5((port & 0x3ff) + 1, (val >> 8) & 0xff, 1);	// high byte -> CRTC data, written as one byte (was vga_write_p3d5(0,val>>8,1)); this change untested
 			}
 			else E_Exit("unimplemented XGA MMIO");
 			break;
@@ -1177,20 +1361,304 @@ void XGA_Write(Bitu port, Bitu val, Bitu
 			if(len==1) vga_write_p3d5(0,val,1);
 			else E_Exit("unimplemented XGA MMIO");
 			break;
+                case 0xa4d4: case 0xa8d4:
+                	s3d.src_base = val & 0x3ffff8;
+                	break;
+                case 0xa4d8: case 0xa8d8:
+                	s3d.dest_base = val & 0x3ffff8;
+                	break;
+                case 0xa4dc: case 0xa8dc:
+                	s3d.clip_l = (val >> 16) & 0x7ff;
+                	s3d.clip_r = val & 0x7ff;
+                	break;
+                case 0xa4e0: case 0xa8e0:
+                	s3d.clip_t = (val >> 16) & 0x7ff;
+                	s3d.clip_b = val & 0x7ff;
+                	break;
+                case 0xa4e4: case 0xa8e4:
+                	s3d.dest_str = (val >> 16) & 0xff8;
+                	s3d.src_str = val & 0xff8;
+                	break;
+                case 0xa4e8: case 0xace8:
+                	s3d.mono_pat_0 = val;
+                	break;
+                case 0xa4ec: case 0xacec:
+                	s3d.mono_pat_1 = val;
+                	break;
+                case 0xa4f0: case 0xacf0:
+                	s3d.pat_bg_clr = val;
+                	break;
+                case 0xa4f4: case 0xa8f4: case 0xacf4:
+                	s3d.pat_fg_clr = val;
+                	break;
+                case 0xa4f8:
+                	s3d.src_bg_clr = val;
+                	break;
+                case 0xa4fc:
+                	s3d.src_fg_clr = val;
+                	break;
+                case 0xa500: case 0xa900:
+                	s3d.cmd_set = val;
+                	if (!(val & CMD_SET_AE)){
+                        	s3_virge_bitblt(-1, 0);
+			}
+	               	break;
+                case 0xa504:
+                	s3d.r_width = (val >> 16) & 0x7ff;
+                	s3d.r_height = val & 0x7ff;
+                	break;
+                case 0xa508:
+                	s3d.rsrc_x = (val >> 16) & 0x7ff;
+                	s3d.rsrc_y = val & 0x7ff;
+                	break;
+                case 0xa50c:
+                	s3d.rdest_x = (val >> 16) & 0x7ff;
+                	s3d.rdest_y = val & 0x7ff;
+                	if (s3d.cmd_set & CMD_SET_AE){
+                       		s3_virge_bitblt(-1, 0);
+			}
+                	break;
+                case 0xa000: case 0xa004: case 0xa008: case 0xa00c:
+                case 0xa010: case 0xa014: case 0xa018: case 0xa01c:
+                case 0xa020: case 0xa024: case 0xa028: case 0xa02c:
+                case 0xa030: case 0xa034: case 0xa038: case 0xa03c:
+                case 0xa040: case 0xa044: case 0xa048: case 0xa04c:
+                case 0xa050: case 0xa054: case 0xa058: case 0xa05c:
+                case 0xa060: case 0xa064: case 0xa068: case 0xa06c:
+                case 0xa070: case 0xa074: case 0xa078: case 0xa07c:
+                case 0xa080: case 0xa084: case 0xa088: case 0xa08c:
+                case 0xa090: case 0xa094: case 0xa098: case 0xa09c:
+                case 0xa0a0: case 0xa0a4: case 0xa0a8: case 0xa0ac:
+                case 0xa0b0: case 0xa0b4: case 0xa0b8: case 0xa0bc:
+                case 0xa0c0: case 0xa0c4: case 0xa0c8: case 0xa0cc:
+                case 0xa0d0: case 0xa0d4: case 0xa0d8: case 0xa0dc:
+                case 0xa0e0: case 0xa0e4: case 0xa0e8: case 0xa0ec:
+                case 0xa0f0: case 0xa0f4: case 0xa0f8: case 0xa0fc:
+                case 0xa100: case 0xa104: case 0xa108: case 0xa10c:
+                case 0xa110: case 0xa114: case 0xa118: case 0xa11c:
+                case 0xa120: case 0xa124: case 0xa128: case 0xa12c:
+                case 0xa130: case 0xa134: case 0xa138: case 0xa13c:
+                case 0xa140: case 0xa144: case 0xa148: case 0xa14c:
+                case 0xa150: case 0xa154: case 0xa158: case 0xa15c:
+                case 0xa160: case 0xa164: case 0xa168: case 0xa16c:
+                case 0xa170: case 0xa174: case 0xa178: case 0xa17c:
+                case 0xa180: case 0xa184: case 0xa188: case 0xa18c:
+                case 0xa190: case 0xa194: case 0xa198: case 0xa19c:
+                case 0xa1a0: case 0xa1a4: case 0xa1a8: case 0xa1ac:
+                case 0xa1b0: case 0xa1b4: case 0xa1b8: case 0xa1bc:
+                case 0xa1c0: case 0xa1c4: case 0xa1c8: case 0xa1cc:
+                case 0xa1d0: case 0xa1d4: case 0xa1d8: case 0xa1dc:
+                case 0xa1e0: case 0xa1e4: case 0xa1e8: case 0xa1ec:
+                case 0xa1f0: case 0xa1f4: case 0xa1f8: case 0xa1fc:
+                {
+                        int x = port & 4;
+                        int y = (port >> 3) & 7;
+                        s3d.pattern_8[y*8 + x]     = val & 0xff;
+                        s3d.pattern_8[y*8 + x + 1] = val >> 8;
+                        s3d.pattern_8[y*8 + x + 2] = val >> 16;
+                        s3d.pattern_8[y*8 + x + 3] = val >> 24;
+                        
+                        x = (port >> 1) & 6;
+                        y = (port >> 4) & 7;
+                        s3d.pattern_16[y*8 + x]     = val & 0xffff;
+                        s3d.pattern_16[y*8 + x + 1] = val >> 16;
+                }
+                break;
+                case 0xa96c:
+                	s3d.lxend0 = (val >> 16) & 0x7ff;
+                	s3d.lxend1 = val & 0x7ff;
+                	break;
+                case 0xa970:
+                	s3d.ldx = (int32_t)val;
+                	break;
+                case 0xa974:
+                	s3d.lxstart = val;
+                	break;
+                case 0xa978:
+                	s3d.lystart = val & 0x7ff;
+                	break;
+                case 0xa97c:
+                	s3d.lycnt = val & 0x7ff;
+                	s3d.line_dir = val >> 31;
+                	if (s3d.cmd_set & CMD_SET_AE)
+                        	s3_virge_bitblt(-1, 0);
+                	break;
+                case 0xad00:
+                	s3d.cmd_set = val;
+                	if (!(val & CMD_SET_AE))
+                        	s3_virge_bitblt(-1, 0);
+                	break;
+                case 0xad68:
+                	s3d.prdx = val;
+                	break;
+                case 0xad6c:
+                	s3d.prxstart = val;
+                	break;
+                case 0xad70:
+                	s3d.pldx = val;
+                	break;
+                case 0xad74:
+               	s3d.plxstart = val;
+                	break;
+                case 0xad78:
+                	s3d.pystart = val & 0x7ff;
+                	break;
+                case 0xad7c:
+                	s3d.pycnt = val & 0x300007ff;
+                	if (s3d.cmd_set & CMD_SET_AE)
+                        	s3_virge_bitblt(-1, 0);
+                	break;
+              	case 0xb4d4:
+			s3d_tri.z_base = val & 0x3ffff8;
+			break;
+                case 0xb4d8:
+			s3d_tri.dest_base = val & 0x3ffff8;
+			break;
+                case 0xb4dc:
+			s3d_tri.clip_l = (val >> 16) & 0x7ff;
+			s3d_tri.clip_r = val & 0x7ff;
+                	break;
+                case 0xb4e0:
+                	s3d_tri.clip_t = (val >> 16) & 0x7ff;
+                	s3d_tri.clip_b = val & 0x7ff;
+                	break;
+                case 0xb4e4:
+                	s3d_tri.dest_str = (val >> 16) & 0xff8;
+                	s3d.src_str = val & 0xff8;
+                	break;
+                case 0xb4e8:
+                	s3d_tri.z_str = val & 0xff8;
+                	break;
+                case 0xb4ec:
+                	s3d_tri.tex_base = val & 0x3ffff8;
+                	break;
+                case 0xb4f0:
+                	s3d_tri.tex_bdr_clr = val & 0xffffff;
+                	break;
+                case 0xb500:
+	                s3d_tri.cmd_set = val;
+        	        if (!(val & CMD_SET_AE))
+                	        s3_virge_triangle();
+                	break;
+                case 0xb504:
+                	s3d_tri.tbv = val & 0xfffff;
+                	break;
+                case 0xb508:
+                	s3d_tri.tbu = val & 0xfffff;
+                	break;
+                case 0xb50c:
+                	s3d_tri.TdWdX = val;
+                	break;
+                case 0xb510:
+                	s3d_tri.TdWdY = val;
+                	break;
+                case 0xb514:
+                	s3d_tri.tws = val;
+                	break;
+                case 0xb518:
+                	s3d_tri.TdDdX = val;
+                	break;
+                case 0xb51c:
+                	s3d_tri.TdVdX = val;
+                	break;
+                case 0xb520:
+                	s3d_tri.TdUdX = val;
+                	break;
+                case 0xb524:
+                	s3d_tri.TdDdY = val;
+                	break;
+                case 0xb528:
+               	s3d_tri.TdVdY = val;
+                	break;
+                case 0xb52c:
+               	s3d_tri.TdUdY = val;
+                	break;
+                case 0xb530:
+                	s3d_tri.tds = val;
+                	break;
+                case 0xb534:
+                	s3d_tri.tvs = val;
+                	break;
+                case 0xb538:
+                	s3d_tri.tus = val;
+                	break;
+                case 0xb53c:
+                	s3d_tri.TdGdX = val >> 16;
+                	s3d_tri.TdBdX = val & 0xffff;
+                	break;
+                case 0xb540:
+                	s3d_tri.TdAdX = val >> 16;
+                	s3d_tri.TdRdX = val & 0xffff;
+               		break;
+                case 0xb544:
+                	s3d_tri.TdGdY = val >> 16;
+                	s3d_tri.TdBdY = val & 0xffff;
+                	break;
+                case 0xb548:
+                	s3d_tri.TdAdY = val >> 16;
+                	s3d_tri.TdRdY = val & 0xffff;
+                	break;
+                case 0xb54c:
+                	s3d_tri.tgs = (val >> 16) & 0xffff;
+                	s3d_tri.tbs = val & 0xffff;
+                	break;
+                case 0xb550:
+                	s3d_tri.tas = (val >> 16) & 0xffff;
+                	s3d_tri.trs = val & 0xffff;
+                	break;
+                case 0xb554:
+                	s3d_tri.TdZdX = val;
+                	break;
+                case 0xb558:
+                	s3d_tri.TdZdY = val;
+                	break;
+                case 0xb55c:
+                	s3d_tri.tzs = val;
+                	break;
+                case 0xb560:
+                	s3d_tri.TdXdY12 = val;
+                	break;
+                case 0xb564:
+                	s3d_tri.txend12 = val;
+                	break;
+                case 0xb568:
+                	s3d_tri.TdXdY01 = val;
+                	break;
+                case 0xb56c:
+                	s3d_tri.txend01 = val;
+                	break;
+                case 0xb570:
+                	s3d_tri.TdXdY02 = val;
+                	break;
+                case 0xb574:
+                	s3d_tri.txs = val;
+                	break;
+                case 0xb578:
+                	s3d_tri.tys = val;
+                	break;
+                case 0xb57c:
+                	s3d_tri.ty01 = (val >> 16) & 0x7ff;
+                	s3d_tri.ty12 = val & 0x7ff;
+                	s3d_tri.tlr = val >> 31;
+                	if (s3d_tri.cmd_set & CMD_SET_AE) s3_virge_triangle();	// autoexecute: the last parameter write starts the triangle
+                	break;	// previously missing, so this case fell through into default
 		default:
-			if(port <= 0x4000) {
-				//LOG_MSG("XGA: Wrote to port %4x with %08x, len %x", port, val, len);
+			/* if(port <= 0x4000) {
+				// LOG_MSG("XGA: Wrote to port %4x with %08x, len %x", port, val, len);
 				xga.waitcmd.newline = false;
 				XGA_DrawWait(val, len);
 				
 			}
-			else LOG_MSG("XGA: Wrote to port %x with %x, len %x", (int)port, (int)val, (int)len);
+			else LOG_MSG("XGA: Wrote to port %x with %x, len %x", (int)port, (int)val, (int)len); */
 			break;
 	}
+	}
 }
 
 Bitu XGA_Read(Bitu port, Bitu len) {
-	switch(port) {
+	// LOG_MSG("XGA: Read from port %x, len %x", port, len);
+
+	switch(port & 0xffff) {
 		case 0x8118:
 		case 0x9ae8:
 			return 0x400; // nothing busy
@@ -1208,11 +1676,11 @@ Bitu XGA_Read(Bitu port, Bitu len) {
 				break;
 			}
 		case 0x83d4:
-			if(len==1) return vga_read_p3d4(0,0);
+			if(len==1) return vga_read_p3d4(port & 0x3ff, 1); // vga_read_p3d4(0,0);	// this change untested
 			else E_Exit("unimplemented XGA MMIO");
 			break;
 		case 0x83d5:
-			if(len==1) return vga_read_p3d5(0,0);
+			if(len==1) return vga_read_p3d5(port & 0x3ff, 1); // data port must be read through p3d5, not p3d4 (was vga_read_p3d5(0,0));	// this change untested
 			else E_Exit("unimplemented XGA MMIO");
 			break;
 		case 0x9ae9:
@@ -1232,8 +1700,60 @@ Bitu XGA_Read(Bitu port, Bitu len) {
 		case 0xaee8:
 			return XGA_GetDualReg(xga.readmask);
 			break;
+		case 0x8504:
+			if (s3d_busy){
+				return (0x10 << 8);
+			}else{
+				return (0x10 << 8) | (1 << 13);
+			}
+			break;
+                case 0xa4d4: case 0xa8d4:
+               	return s3d.src_base;
+                	break;
+                case 0xa4d8: case 0xa8d8:
+                	return s3d.dest_base;
+                	break;
+                case 0xa4dc: case 0xa8dc:
+                	return (s3d.clip_l << 16) | (s3d.clip_r);
+                	break;
+                case 0xa4e0: case 0xa8e0:
+                	return (s3d.clip_t << 16) | (s3d.clip_b);
+                	break;
+                case 0xa4e4: case 0xa8e4:
+                	return (s3d.dest_str << 16) | (s3d.src_str);
+                	break;
+                case 0xa4e8: case 0xace8:
+                	return s3d.mono_pat_0;
+                	break;
+                case 0xa4ec: case 0xacec:
+                	return s3d.mono_pat_1;
+                	break;
+                case 0xa4f0:
+                	return s3d.pat_bg_clr;
+                	break;
+                case 0xa4f4: case 0xa8f4: case 0xacf4:
+                	return s3d.pat_fg_clr;
+                	break;
+                case 0xa4f8:
+                	return s3d.src_bg_clr;
+                	break;
+                case 0xa4fc:
+                	return s3d.src_fg_clr;
+                	break;
+                case 0xa500:
+                	return s3d.cmd_set;
+               	break;
+                case 0xa504:
+                	return (s3d.r_width << 16) | (s3d.r_height);
+                	break;
+                case 0xa508:
+                	return (s3d.rsrc_x << 16) | (s3d.rsrc_y);
+                	break;
+                case 0xa50c:
+                	return (s3d.rdest_x << 16) | (s3d.rdest_y);
+                	break;
 		default:
-			//LOG_MSG("XGA: Read from port %x, len %x", port, len);
+			LOG_MSG("XGA: Read from port %x, len %x", (int)port, (int)len);	// cast Bitu to int to match %x, as the write-side log does
 			break;
 	}
 	return 0xffffffff; 
@@ -1338,3 +1858,1628 @@ void VGA_SetupXGA(void) {
 	IO_RegisterWriteHandler(0xe2ea,&XGA_Write,IO_MB | IO_MW | IO_MD);
 	IO_RegisterReadHandler(0xe2ea,&XGA_Read,IO_MB | IO_MW | IO_MD);
 }
+
+
+/* DEFINITIONS FOR BLITTER */
+/* READ: load one pixel at BYTE offset addr (wrapped to 4 MiB) into val; width comes from the caller's local `bpp` (0 = 8 bpp, 1 = 16 bpp). */
+#define READ(addr, val)                                                                         \
+        {                                                                                       \
+                switch (bpp)                                                                    \
+                {                                                                               \
+                        case 0: /*8 bpp*/                                                       \
+                        val=vga.mem.linear[(addr) & 0x3fffff];                                  \
+                        break;                                                                  \
+                        case 1: /*16 bpp: addr is a byte offset, so take the word located AT it*/ \
+                        val=*(Bit16u*)&vga.mem.linear[(addr) & 0x3fffff];                       \
+                        break;                                                                  \
+                }                                                                               \
+	}
+/* Z_READ: 16-bit Z-buffer value at byte offset addr (wrapped to 4 MiB). */
+#define Z_READ(addr) (*(Bit16u*)&vga.mem.linear[(addr) & 0x3fffff])
+/* Z_WRITE: stores val only when the CMD_SET_ZB_MODE bits of s3d_tri.cmd_set are clear. Expands to a bare `if`: never follow a use with `else`. */
+#define Z_WRITE(addr, val) if (!(s3d_tri.cmd_set & CMD_SET_ZB_MODE)) *(Bit16u*)&vga.mem.linear[(addr) & 0x3fffff] = (val)
+/* CLIP: clears the caller's local `update` when CMD_SET_HC is set in s3d.cmd_set and (x, y) is outside the inclusive 2D clip rectangle. */
+#define CLIP(x, y)                                              \
+        {                                                       \
+                if ((s3d.cmd_set & CMD_SET_HC) &&               \
+                    ((x) < s3d.clip_l ||                        \
+                     (x) > s3d.clip_r ||                        \
+                     (y) < s3d.clip_t ||                        \
+                     (y) > s3d.clip_b))                         \
+                        update = 0;                             \
+		}
+/* CLIP_3D: same test as CLIP, using s3d_tri.cmd_set and the s3d_tri clip rectangle. */
+#define CLIP_3D(x, y)                                           \
+        {                                                       \
+                if ((s3d_tri.cmd_set & CMD_SET_HC) &&           \
+                    ((x) < s3d_tri.clip_l ||                    \
+                     (x) > s3d_tri.clip_r ||                    \
+                     (y) < s3d_tri.clip_t ||                    \
+                     (y) > s3d_tri.clip_b))                     \
+                        update = 0;                             \
+	}
+/* Z_CLIP: depth test, active only when the CMD_SET_ZB_MODE bits are clear. Bits 20-22 of s3d_tri.cmd_set pick the compare; a pass copies Zs into Zzb, a fail clears `update`. */
+#define Z_CLIP(Zzb, Zs)                                                 		\
+        {                                                              		\
+                if (!(s3d_tri.cmd_set & CMD_SET_ZB_MODE))            			\
+                switch ((s3d_tri.cmd_set >> 20) & 7)                 			\
+                {                                                       		\
+                        case 0: update = 0; break;                      		\
+                        case 1: if (Zs <= Zzb) update = 0; else Zzb = Zs; break;       	\
+                        case 2: if (Zs != Zzb) update = 0; else Zzb = Zs; break;       	\
+                        case 3: if (Zs <  Zzb) update = 0; else Zzb = Zs; break;       	\
+                        case 4: if (Zs >= Zzb) update = 0; else Zzb = Zs; break;       	\
+                        case 5: if (Zs == Zzb) update = 0; else Zzb = Zs; break;       	\
+                        case 6: if (Zs >  Zzb) update = 0; else Zzb = Zs; break;       	\
+                        case 7: update = 1; Zzb = Zs; break;                      	\
+                }                                                       		\
+	}
+/* MIX: applies the 8-bit raster op s3d.rop bit by bit over the low 24 bits of the caller's locals dest/source/pattern, OR-ing the result into `out`. */
+#define MIX()                                                   \
+        {                                                       \
+                int c;                                          \
+                for (c = 0; c < 24; c++)                        \
+                {                                               \
+                        int d = (dest & (1 << c)) ? 1 : 0;      \
+                        if (source & (1 << c))  d |= 2;         \
+                        if (pattern & (1 << c)) d |= 4;         \
+                        if (s3d.rop & (1 << d)) out |= (1 << c);\
+                }						 \
+	}
+/* WRITE: store one pixel at BYTE offset addr (wrapped to 4 MiB); width comes from the caller's local `bpp` (0 = 8 bpp, 1 = 16 bpp). */
+#define WRITE(addr, val)                                                                        \
+        {                                                                                       \
+                switch (bpp)                                                                    \
+                {                                                                               \
+                        case 0: /*8 bpp*/                                                       \
+			 vga.mem.linear[(addr) & 0x3fffff]=(val);				 \
+                        break;                                                                  \
+                        case 1: /*16 bpp: addr is a byte offset, so store the word AT it*/      \
+			 *(Bit16u*)&vga.mem.linear[(addr) & 0x3fffff]=(val);			 \
+                        break;                                                                  \
+                }										 \
+	}
+
+/* END OF DEFINITIONS FOR BLITTER */
+
+void s3_virge_bitblt(int count, uint32_t cpu_dat)
+{
+        uint32_t mono_pattern[64];
+        int count_mask;
+        int x_inc = (s3d.cmd_set & CMD_SET_XP) ? 1 : -1;
+        int y_inc = (s3d.cmd_set & CMD_SET_YP) ? 1 : -1;
+        int bpp;
+        int x_mul;
+        int cpu_dat_shift;
+        uint32_t *pattern_data;
+        
+        switch (s3d.cmd_set & CMD_SET_FORMAT_MASK)
+        {
+                case CMD_SET_FORMAT_8:
+                bpp = 0;
+                x_mul = 1;
+                cpu_dat_shift = 8;
+                pattern_data = s3d.pattern_8;
+                break;
+                case CMD_SET_FORMAT_16:
+                bpp = 1;
+                x_mul = 2;
+                cpu_dat_shift = 16;
+                pattern_data = s3d.pattern_16;
+                break;
+        }
+        if (s3d.cmd_set & CMD_SET_MP)
+                pattern_data = mono_pattern;
+        
+        switch (s3d.cmd_set & CMD_SET_ITA_MASK)
+        {
+                case CMD_SET_ITA_BYTE:
+                count_mask = ~0x7;
+                break;
+                case CMD_SET_ITA_WORD:
+                count_mask = ~0xf;
+                break;
+                case CMD_SET_ITA_DWORD:
+                default:
+                count_mask = ~0x1f;
+                break;
+        }
+        if (s3d.cmd_set & CMD_SET_MP)
+        {
+                int x, y;
+                for (y = 0; y < 4; y++)
+                {
+                        for (x = 0; x < 8; x++)
+                        {
+                                if (s3d.mono_pat_0 & (1 << (x + y*8)))
+                                        mono_pattern[y*8 + x] = s3d.pat_fg_clr;
+                                else
+                                        mono_pattern[y*8 + x] = s3d.pat_bg_clr;
+                                if (s3d.mono_pat_1 & (1 << (x + y*8)))
+                                        mono_pattern[(y+4)*8 + x] = s3d.pat_fg_clr;
+                                else
+                                        mono_pattern[(y+4)*8 + x] = s3d.pat_bg_clr;
+                        }
+                }
+        }
+        switch (s3d.cmd_set & CMD_SET_COMMAND_MASK)
+        {
+                case CMD_SET_COMMAND_NOP:
+                break;
+                
+                case CMD_SET_COMMAND_BITBLT:
+                if (count == -1)
+                {
+                        s3d.src_x = s3d.rsrc_x;
+                        s3d.src_y = s3d.rsrc_y;
+                        s3d.dest_x = s3d.rdest_x;
+                        s3d.dest_y = s3d.rdest_y;
+                        s3d.w = s3d.r_width;
+                        s3d.h = s3d.r_height;
+                        s3d.rop = (s3d.cmd_set >> 17) & 0xff;
+                        s3d.data_left_count = 0;
+                        
+/*                        LOG_MSG("BitBlt start %i,%i %i,%i %i,%i %02X %x %x\n",
+                                                                 s3d.src_x,
+                                                                 s3d.src_y,
+                                                                 s3d.dest_x,
+                                                                 s3d.dest_y,
+                                                                 s3d.w,
+                                                                 s3d.h,
+                                                                 s3d.rop,
+                                                                 s3d.src_base,
+                                                                 s3d.dest_base);*/
+                        
+                        if (s3d.cmd_set & CMD_SET_IDS)
+                                return;
+                }
+                if (!s3d.h)
+                        return;
+                while (count)
+                {
+                        uint32_t src_addr;
+			 src_addr = s3d.src_base + (s3d.src_x * x_mul) + (s3d.src_y * s3d.src_str);
+                        uint32_t dest_addr;
+			 dest_addr = s3d.dest_base + (s3d.dest_x * x_mul) + (s3d.dest_y * s3d.dest_str);
+                        uint32_t source, dest, pattern;
+                        uint32_t out = 0;
+                        int update = 1;
+
+                        switch (s3d.cmd_set & (CMD_SET_MS | CMD_SET_IDS))
+                        {
+                                case 0:
+                                case CMD_SET_MS:
+                                READ(src_addr, source);
+                                if ((s3d.cmd_set & CMD_SET_TP) && source == s3d.src_fg_clr)
+                                        update = 0;
+                                break;
+                                case CMD_SET_IDS:
+                                if (s3d.data_left_count)
+                                {
+                                        /*Handle shifting for 24-bit data*/
+                                        source = s3d.data_left;
+                                        source |= ((cpu_dat << s3d.data_left_count) & ~0xff000000);
+                                        cpu_dat >>= (cpu_dat_shift - s3d.data_left_count);
+                                        count -= (cpu_dat_shift - s3d.data_left_count);
+                                        s3d.data_left_count = 0;
+                                        if (count < cpu_dat_shift)
+                                        {
+                                                s3d.data_left = cpu_dat;
+                                                s3d.data_left_count = count;
+                                                count = 0;
+                                        }
+                                }
+                                else
+                                {
+                                        source = cpu_dat;
+                                        cpu_dat >>= cpu_dat_shift;
+                                        count -= cpu_dat_shift;
+                                        if (count < cpu_dat_shift)
+                                        {
+                                                s3d.data_left = cpu_dat;
+                                                s3d.data_left_count = count;
+                                                count = 0;
+                                        }
+                                }
+                                if ((s3d.cmd_set & CMD_SET_TP) && source == s3d.src_fg_clr)
+                                        update = 0;
+                                break;
+                                case CMD_SET_IDS | CMD_SET_MS:
+                                source = (cpu_dat & (1 << 31)) ? s3d.src_fg_clr : s3d.src_bg_clr;
+                                if ((s3d.cmd_set & CMD_SET_TP) && !(cpu_dat & (1 << 31)))
+                                        update = 0;
+                                cpu_dat <<= 1;
+                                count--;
+                                break;
+                        }
+
+                        CLIP(s3d.dest_x, s3d.dest_y);
+
+                        if (update)
+                        {
+                                READ(dest_addr, dest);
+                                pattern = pattern_data[(s3d.dest_y & 7)*8 + (s3d.dest_x & 7)];
+                                MIX();
+
+                                WRITE(dest_addr, out);
+                        }
+                
+                        s3d.src_x += x_inc;
+                        s3d.dest_x += x_inc;
+                        if (!s3d.w)
+                        {
+                                s3d.src_x = s3d.rsrc_x;
+                                s3d.dest_x = s3d.rdest_x;
+                                s3d.w = s3d.r_width;
+
+                                s3d.src_y += y_inc;
+                                s3d.dest_y += y_inc;
+                                s3d.h--;
+                                
+                                switch (s3d.cmd_set & (CMD_SET_MS | CMD_SET_IDS))
+                                {
+                                        case CMD_SET_IDS:
+                                        cpu_dat >>= (count - (count & count_mask));
+                                        count &= count_mask;
+                                        s3d.data_left_count = 0;
+                                        break;
+
+                                        case CMD_SET_IDS | CMD_SET_MS:
+                                        cpu_dat <<= (count - (count & count_mask));
+                                        count &= count_mask;
+                                        break;
+                                }
+                                if (!s3d.h)
+                                {
+                                        return;
+                                }
+                        }
+                        else
+                                s3d.w--;                        
+                }
+                break;
+                
+                case CMD_SET_COMMAND_RECTFILL:
+                /*No source, pattern = pat_fg_clr*/
+                if (count == -1)
+                {
+                        s3d.src_x = s3d.rsrc_x;
+                        s3d.src_y = s3d.rsrc_y;
+                        s3d.dest_x = s3d.rdest_x;
+                        s3d.dest_y = s3d.rdest_y;
+                        s3d.w = s3d.r_width;
+                        s3d.h = s3d.r_height;
+                        s3d.rop = (s3d.cmd_set >> 17) & 0xff;
+                        
+/*                        LOG_MSG("RctFll start %i,%i %i,%i %02X %08x\n", s3d.dest_x,
+                                                                 s3d.dest_y,
+                                                                 s3d.w,
+                                                                 s3d.h,
+                                                                 s3d.rop, s3d.dest_base);*/
+                }
+
+                while (count && s3d.h)
+                {
+                        uint32_t dest_addr;
+			 dest_addr = s3d.dest_base + (s3d.dest_x * x_mul) + (s3d.dest_y * s3d.dest_str);
+                        uint32_t source, pattern, out, dest;
+			 int update;
+			 source = 0;
+			 pattern = s3d.pat_fg_clr;
+                        out = 0;
+                        update = 1;
+
+                        CLIP(s3d.dest_x, s3d.dest_y);
+
+                        if (update)
+                        {
+                                READ(dest_addr, dest);
+
+                                MIX();
+
+                                WRITE(dest_addr, out);
+                        }
+
+                        s3d.src_x += x_inc;
+                        s3d.dest_x += x_inc;
+                        if (!s3d.w)
+                        {
+                                s3d.src_x = s3d.rsrc_x;
+                                s3d.dest_x = s3d.rdest_x;
+                                s3d.w = s3d.r_width;
+
+                                s3d.src_y += y_inc;
+                                s3d.dest_y += y_inc;
+                                s3d.h--;
+                                if (!s3d.h)
+                                {
+                                        return;
+                                }
+                        }
+                        else
+                                s3d.w--;                        
+                        count--;
+                }
+                break;
+                
+                case CMD_SET_COMMAND_LINE:
+                if (count == -1)
+                {
+                        s3d.dest_x = s3d.lxstart;
+                        s3d.dest_y = s3d.lystart;
+                        s3d.h = s3d.lycnt;
+                        s3d.rop = (s3d.cmd_set >> 17) & 0xff;
+                }
+                while (s3d.h)
+                {
+                        int x;
+                        int new_x;
+                        int first_pixel;
+			 first_pixel = 1;
+                        
+                        x = s3d.dest_x >> 20;
+
+                        if (s3d.h == s3d.lycnt &&
+                           ((s3d.line_dir && x > s3d.lxend0) ||
+                           (!s3d.line_dir && x < s3d.lxend0)))
+                                x = s3d.lxend0;
+
+                        if (s3d.h == 1)
+                                new_x = s3d.lxend1 + (s3d.line_dir ? 1 : -1);
+                        else
+                                new_x = (s3d.dest_x + s3d.ldx) >> 20;
+
+                        
+                        if ((s3d.line_dir && x > new_x) ||
+                            (!s3d.line_dir && x < new_x))
+                                goto skip_line;
+                                
+                        do
+                        {
+                                uint32_t dest_addr;
+				 dest_addr = s3d.dest_base + (x * x_mul) + (s3d.dest_y * s3d.dest_str);
+                                uint32_t source, dest, pattern, out;
+				 source = 0;
+                                out = 0;
+                                int update;
+				 update = 1;
+
+                                if ((s3d.h == s3d.lycnt || !first_pixel) &&
+                                   ((s3d.line_dir && x < s3d.lxend0) ||
+                                   (!s3d.line_dir && x > s3d.lxend0)))
+                                        update = 0;
+
+                                if ((s3d.h == 1  || !first_pixel) &&
+                                   ((s3d.line_dir && x > s3d.lxend1) ||
+                                   (!s3d.line_dir && x < s3d.lxend1)))
+                                        update = 0;
+
+                                CLIP(x, s3d.dest_y);
+
+                                if (update)
+                                {
+                                        READ(dest_addr, dest);
+                                        pattern = s3d.pat_fg_clr;
+
+                                        MIX();
+
+                                        WRITE(dest_addr, out);
+                                }
+                                
+                                if (x < new_x)
+                                        x++;
+                                else if (x > new_x)
+                                        x--;
+                                first_pixel = 0;
+                        } while (x != new_x);
+
+skip_line:
+                        s3d.dest_x += s3d.ldx;
+                        s3d.dest_y--;
+                        s3d.h--;
+                }
+                break;
+
+                case CMD_SET_COMMAND_POLY:
+                /*No source*/
+                if (s3d.pycnt & (1 << 28))
+                        s3d.dest_r = s3d.prxstart;
+                if (s3d.pycnt & (1 << 29))
+                        s3d.dest_l = s3d.plxstart;
+                s3d.h = s3d.pycnt & 0x7ff;
+                s3d.rop = (s3d.cmd_set >> 17) & 0xff;
+                //LOG_MSG("Start poly - l=%08x r=%08x h=%i rop=%02x\n", s3d.dest_l, s3d.dest_r, s3d.h, s3d.rop);
+                while (s3d.h)
+                {
+                        int x, xend, y, xdir;
+			 x = s3d.dest_l >> 20;
+                        xend = s3d.dest_r >> 20;
+                        y = s3d.pystart & 0x7ff;
+                        xdir = (x < xend) ? 1 : -1;
+                        //LOG_MSG(" %03i: %i - %i  %08x-%08x\n", y, x, xend, s3d.dest_l, s3d.dest_r);
+                        do
+                        {
+                                uint32_t dest_addr;
+				 dest_addr = s3d.dest_base + (x * x_mul) + (y * s3d.dest_str);
+                                uint32_t source, dest, pattern, out;
+				 source = 0;
+                                out = 0;
+                                int update;
+				 update = 1;
+
+                                CLIP(x, y);
+
+                                if (update)
+                                {
+                                        READ(dest_addr, dest);
+                                        pattern = pattern_data[(y & 7)*8 + (x & 7)];
+                                        MIX();
+
+                                        WRITE(dest_addr, out);
+                                }
+                                
+                                x = (x + xdir) & 0x7ff;
+                        }
+                        while (x != (xend + xdir));
+
+                        s3d.dest_l += s3d.pldx;
+                        s3d.dest_r += s3d.prdx;
+                        s3d.h--;
+                        s3d.pystart = (s3d.pystart - 1) & 0x7ff;
+                }
+                break;
+
+                default:
+                E_Exit("s3_virge_bitblt : blit command %i %08x\n", (s3d.cmd_set >> 27) & 0xf, s3d.cmd_set);
+        }
+}
+
+
+#define RGB15_TO_24(val, r, g, b) b = (((val) & 0x001f) << 3) | (((val) & 0x001f) >> 2);     \
+                                  g = (((val) & 0x03e0) >> 2) | (((val) & 0x03e0) >> 7);     \
+                                  r = (((val) & 0x7c00) >> 7) | (((val) & 0x7c00) >> 12); /* 5:5:5 -> 8:8:8; expands to three ';'-terminated statements */
+/* Unpack an 8:8:8 value into r, g, b.  Unlike RGB15_TO_24, the expansion carries no trailing ';'. */
+#define RGB24_TO_24(val, r, g, b) b = (val) & 0xff;             \
+                                  g = ((val) & 0xff00) >> 8;    \
+                                  r = ((val) & 0xff0000) >> 16
+/* Pack r, g, b into 5:5:5 'dest', optionally dithered via dither[_y & 3][_x & 3].  Expands to a bare if/else. */
+#define RGB15(r, g, b, dest) \
+        if (dithering_enabled)                           \
+        {                                                       \
+                int add = dither[_y & 3][_x & 3];               \
+                int _r = ((r) > 248) ? 248 : (r)+add;               \
+                int _g = ((g) > 248) ? 248 : (g)+add;               \
+                int _b = ((b) > 248) ? 248 : (b)+add;               \
+                dest = ((_b >> 3) & 0x1f) | (((_g >> 3) & 0x1f) << 5) | (((_r >> 3) & 0x1f) << 10);     \
+        }                                                                                               \
+        else                                                                                            \
+                dest = (((b) >> 3) & 0x1f) | ((((g) >> 3) & 0x1f) << 5) | ((((r) >> 3) & 0x1f) << 10)
+/* Pack 8-bit r, g, b into a single 8:8:8 value. */
+#define RGB24(r, g, b) ((b) | ((g) << 8) | ((r) << 16))
+/* Function-pointer hooks: tex_read is the texel reader called by the tex_sample_* routines; tex_sample and dest_pixel are not invoked in this part of the file. */
+static void (*tex_read)(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out);
+static void (*tex_sample)(s3d_state_t *state);
+static void (*dest_pixel)(s3d_state_t *state);
+/* Larger / smaller of two values; an argument may be evaluated twice. */
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+/* Pixel coordinates read by RGB15 to pick a dither table entry. */
+static int _x, _y;
+
+static void tex_ARGB1555(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
+{
+        /* Read the ARGB1555 texel selected by texture_state and widen it to 8 bits per channel. */
+        const int col = (texture_state->u & 0x7fc0000) >> texture_state->texture_shift;
+        const int row = (texture_state->v & 0x7fc0000) >> texture_state->texture_shift;
+        const uint16_t texel = state->texture[texture_state->level][col + (row << texture_state->level)];
+        out->r = ((texel >> 7) & 0xf8) | ((texel >> 12) & 0x07);  /* 5 bits, top 3 repeated below */
+        out->g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 0x07);
+        out->b = ((texel << 3) & 0xf8) | ((texel >> 2) & 0x07);
+        out->a = (texel >> 15) ? 0xff : 0;                        /* 0xff when bit 15 is set */
+}
+
+static void tex_ARGB1555_nowrap(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
+{
+        /* Read an ARGB1555 texel and widen it to 8 bits per channel; state->tex_bdr_clr is
+           used in place of the texel when the top five bits of (u | v) are all set. */
+        const int col = (texture_state->u & 0x7fc0000) >> texture_state->texture_shift;
+        const int row = (texture_state->v & 0x7fc0000) >> texture_state->texture_shift;
+        uint16_t texel = state->texture[texture_state->level][col + (row << texture_state->level)];
+        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000)
+                texel = state->tex_bdr_clr;
+        out->r = ((texel >> 7) & 0xf8) | ((texel >> 12) & 0x07);  /* 5 bits, top 3 repeated below */
+        out->g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 0x07);
+        out->b = ((texel << 3) & 0xf8) | ((texel >> 2) & 0x07);
+        out->a = (texel >> 15) ? 0xff : 0;                        /* 0xff when bit 15 is set */
+}
+
+static void tex_ARGB4444(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
+{
+        /* Read the ARGB4444 texel selected by texture_state; each nibble n becomes the byte n * 0x11. */
+        const int col = (texture_state->u & 0x7fc0000) >> texture_state->texture_shift;
+        const int row = (texture_state->v & 0x7fc0000) >> texture_state->texture_shift;
+        const uint16_t texel = state->texture[texture_state->level][col + (row << texture_state->level)];
+        out->r = ((texel >> 8) & 0x0f) * 0x11;
+        out->g = ((texel >> 4) & 0x0f) * 0x11;
+        out->b = (texel & 0x0f) * 0x11;
+        out->a = ((texel >> 12) & 0x0f) * 0x11;
+}
+
+static void tex_ARGB4444_nowrap(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
+{
+        /* Read an ARGB4444 texel and expand each nibble n to the byte n * 0x11; state->tex_bdr_clr
+           is used in place of the texel when the top five bits of (u | v) are all set. */
+        const int col = (texture_state->u & 0x7fc0000) >> texture_state->texture_shift;
+        const int row = (texture_state->v & 0x7fc0000) >> texture_state->texture_shift;
+        uint16_t texel = state->texture[texture_state->level][col + (row << texture_state->level)];
+        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000)
+                texel = state->tex_bdr_clr;
+        out->r = ((texel >> 8) & 0x0f) * 0x11;
+        out->g = ((texel >> 4) & 0x0f) * 0x11;
+        out->b = (texel & 0x0f) * 0x11;
+        out->a = ((texel >> 12) & 0x0f) * 0x11;
+}
+
+static void tex_ARGB8888(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
+{
+        /* Read the 32-bit ARGB texel selected by texture_state and split it into its four bytes. */
+        const int col = (texture_state->u & 0x7fc0000) >> texture_state->texture_shift;
+        const int row = (texture_state->v & 0x7fc0000) >> texture_state->texture_shift;
+        const uint32_t texel = ((uint32_t *)state->texture[texture_state->level])[col + (row << texture_state->level)];
+        out->a = (texel & 0xff000000) >> 24;
+        out->r = (texel & 0x00ff0000) >> 16;
+        out->g = (texel & 0x0000ff00) >> 8;
+        out->b =  texel & 0x000000ff;
+}
+static void tex_ARGB8888_nowrap(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
+{
+        /* Read a 32-bit ARGB texel and split it into its four bytes; state->tex_bdr_clr is
+           used in place of the texel when the top five bits of (u | v) are all set. */
+        const int col = (texture_state->u & 0x7fc0000) >> texture_state->texture_shift;
+        const int row = (texture_state->v & 0x7fc0000) >> texture_state->texture_shift;
+        uint32_t texel = ((uint32_t *)state->texture[texture_state->level])[col + (row << texture_state->level)];
+        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000)
+                texel = state->tex_bdr_clr;
+        out->a = (texel & 0xff000000) >> 24;
+        out->r = (texel & 0x00ff0000) >> 16;
+        out->g = (texel & 0x0000ff00) >> 8;
+        out->b =  texel & 0x000000ff;
+}
+
+static void tex_sample_normal(s3d_state_t *state)
+{
+        /* Point-sample texture level max_d at (u + tbu, v + tbv) into state->dest_rgba. */
+        s3d_texture_state_t ts;
+
+        ts.level = state->max_d;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        ts.u = state->u + state->tbu;
+        ts.v = state->v + state->tbv;
+        tex_read(state, &ts, &state->dest_rgba);
+}
+
+static void tex_sample_normal_filter(s3d_state_t *state)
+{
+        /* Bilinear sample at level max_d: the four neighbouring texels are fetched through
+           tex_read and blended into state->dest_rgba with 8-bit fractional weights. */
+        s3d_texture_state_t ts;
+        rgba_t tap[4];
+        int step;
+        int fu = 0, fv = 0;
+        int i;
+
+        ts.level = state->max_d;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        step = 1 << ts.texture_shift;           /* advances the texel index by one */
+
+        /* Taps: 0 = (u, v), 1 = (u + step, v), 2 = (u, v + step), 3 = (u + step, v + step). */
+        for (i = 0; i < 4; i++)
+        {
+                ts.u = state->u + state->tbu + ((i & 1) ? step : 0);
+                ts.v = state->v + state->tbv + ((i & 2) ? step : 0);
+                tex_read(state, &ts, &tap[i]);
+                if (i == 0)
+                {
+                        /* Blend fractions: the 8 bits of u and v just below the texel index. */
+                        fu = (ts.u >> (ts.texture_shift - 8)) & 0xff;
+                        fv = (ts.v >> (ts.texture_shift - 8)) & 0xff;
+                }
+        }
+
+        {
+                const int w00 = (256 - fu) * (256 - fv);
+                const int w10 = fu * (256 - fv);
+                const int w01 = (256 - fu) * fv;
+                const int w11 = fu * fv;
+
+                state->dest_rgba.r = (tap[0].r * w00 + tap[1].r * w10 + tap[2].r * w01 + tap[3].r * w11) >> 16;
+                state->dest_rgba.g = (tap[0].g * w00 + tap[1].g * w10 + tap[2].g * w01 + tap[3].g * w11) >> 16;
+                state->dest_rgba.b = (tap[0].b * w00 + tap[1].b * w10 + tap[2].b * w01 + tap[3].b * w11) >> 16;
+                state->dest_rgba.a = (tap[0].a * w00 + tap[1].a * w10 + tap[2].a * w01 + tap[3].a * w11) >> 16;
+        }
+}
+
+static void tex_sample_mipmap(s3d_state_t *state)
+{
+        /* Point-sample into state->dest_rgba at level max_d minus bits 27..30 of d (clamped at 0). */
+        s3d_texture_state_t ts;
+        const int lod = (state->d < 0) ? 0 : ((state->d >> 27) & 0xf);  /* no reduction for negative d */
+        ts.level = state->max_d - lod;
+        if (ts.level < 0)
+                ts.level = 0;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        ts.u = state->u + state->tbu;
+        ts.v = state->v + state->tbv;
+        tex_read(state, &ts, &state->dest_rgba);
+}
+
+static void tex_sample_mipmap_filter(s3d_state_t *state)
+{
+        /* Bilinear sample at level max_d minus bits 27..30 of d (clamped at 0): four texels are
+           fetched through tex_read and blended into state->dest_rgba with 8-bit fractions. */
+        s3d_texture_state_t ts;
+        rgba_t tap[4];
+        const int lod = (state->d < 0) ? 0 : ((state->d >> 27) & 0xf);  /* no reduction for negative d */
+        int step, i;
+        int fu = 0, fv = 0;
+
+        ts.level = state->max_d - lod;
+        if (ts.level < 0)
+                ts.level = 0;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        step = 1 << ts.texture_shift;           /* advances the texel index by one */
+
+        /* Taps: 0 = (u, v), 1 = (u + step, v), 2 = (u, v + step), 3 = (u + step, v + step). */
+        for (i = 0; i < 4; i++)
+        {
+                ts.u = state->u + state->tbu + ((i & 1) ? step : 0);
+                ts.v = state->v + state->tbv + ((i & 2) ? step : 0);
+                tex_read(state, &ts, &tap[i]);
+                if (i == 0)
+                {
+                        /* Blend fractions: the 8 bits of u and v just below the texel index. */
+                        fu = (ts.u >> (ts.texture_shift - 8)) & 0xff;
+                        fv = (ts.v >> (ts.texture_shift - 8)) & 0xff;
+                }
+        }
+
+        {
+                const int w00 = (256 - fu) * (256 - fv);
+                const int w10 = fu * (256 - fv);
+                const int w01 = (256 - fu) * fv;
+                const int w11 = fu * fv;
+
+                state->dest_rgba.r = (tap[0].r * w00 + tap[1].r * w10 + tap[2].r * w01 + tap[3].r * w11) >> 16;
+                state->dest_rgba.g = (tap[0].g * w00 + tap[1].g * w10 + tap[2].g * w01 + tap[3].g * w11) >> 16;
+                state->dest_rgba.b = (tap[0].b * w00 + tap[1].b * w10 + tap[2].b * w01 + tap[3].b * w11) >> 16;
+                state->dest_rgba.a = (tap[0].a * w00 + tap[1].a * w10 + tap[2].a * w01 + tap[3].a * w11) >> 16;
+        }
+}
+
+static void tex_sample_persp_normal(s3d_state_t *state)
+{
+        /* Point sample at level max_d with u and v scaled by recip = 2^46 / state->w (0 when
+           state->w is zero); each product is shifted right by 12 + max_d before tbu/tbv is added. */
+        s3d_texture_state_t ts;
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 12 + state->max_d;
+
+        ts.level = state->max_d;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        ts.u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        ts.v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+
+        tex_read(state, &ts, &state->dest_rgba);
+}
+
+static void tex_sample_persp_normal_filter(s3d_state_t *state)
+{
+        /* Bilinear sample at level max_d with u and v first scaled by the reciprocal of
+           state->w: recip = 2^46 / state->w (0 when state->w is zero), and each product is
+           shifted right by 12 + max_d before tbu/tbv is added. */
+
+        s3d_texture_state_t ts;
+        rgba_t tap[4];
+        /* NOTE(review): the 1ULL dividend makes this an unsigned division - confirm that is
+           the intended treatment of a negative state->w. */
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 12 + state->max_d;
+        const int32_t u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        const int32_t v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+        int step;
+        int fu, fv;
+        int i;
+
+        ts.level = state->max_d;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        step = 1 << ts.texture_shift;           /* advances the texel index by one */
+
+        /* Blend fractions: the 8 bits of u and v just below the texel index. */
+        fu = (u >> (ts.texture_shift - 8)) & 0xff;
+        fv = (v >> (ts.texture_shift - 8)) & 0xff;
+
+        /* Taps: 0 = (u, v), 1 = (u + step, v), 2 = (u, v + step), 3 = (u + step, v + step). */
+        for (i = 0; i < 4; i++)
+        {
+                ts.u = u + ((i & 1) ? step : 0);
+                ts.v = v + ((i & 2) ? step : 0);
+                tex_read(state, &ts, &tap[i]);
+        }
+
+        /* The four weights sum to 65536, hence the final >> 16. */
+        {
+                const int w00 = (256 - fu) * (256 - fv);
+                const int w10 = fu * (256 - fv);
+                const int w01 = (256 - fu) * fv;
+                const int w11 = fu * fv;
+
+                state->dest_rgba.r = (tap[0].r * w00 + tap[1].r * w10 + tap[2].r * w01 + tap[3].r * w11) >> 16;
+                state->dest_rgba.g = (tap[0].g * w00 + tap[1].g * w10 + tap[2].g * w01 + tap[3].g * w11) >> 16;
+                state->dest_rgba.b = (tap[0].b * w00 + tap[1].b * w10 + tap[2].b * w01 + tap[3].b * w11) >> 16;
+                state->dest_rgba.a = (tap[0].a * w00 + tap[1].a * w10 + tap[2].a * w01 + tap[3].a * w11) >> 16;
+        }
+}
+
+static void tex_sample_persp_normal_375(s3d_state_t *state)
+{
+        /* Point sample at level max_d with u and v scaled by recip = 2^46 / state->w (0 when
+           state->w is zero); each product is shifted right by 8 + max_d before tbu/tbv is added. */
+        s3d_texture_state_t ts;
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 8 + state->max_d;
+
+        ts.level = state->max_d;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        ts.u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        ts.v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+
+        tex_read(state, &ts, &state->dest_rgba);
+}
+
+static void tex_sample_persp_normal_filter_375(s3d_state_t *state)
+{
+        /* Bilinear sample at level max_d with u and v first scaled by the reciprocal of
+           state->w: recip = 2^46 / state->w (0 when state->w is zero), and each product is
+           shifted right by 8 + max_d before tbu/tbv is added. */
+
+        s3d_texture_state_t ts;
+        rgba_t tap[4];
+        /* NOTE(review): the 1ULL dividend makes this an unsigned division - confirm that is
+           the intended treatment of a negative state->w. */
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 8 + state->max_d;
+        const int32_t u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        const int32_t v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+        int step;
+        int fu, fv;
+        int i;
+
+        ts.level = state->max_d;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        step = 1 << ts.texture_shift;           /* advances the texel index by one */
+
+        /* Blend fractions: the 8 bits of u and v just below the texel index. */
+        fu = (u >> (ts.texture_shift - 8)) & 0xff;
+        fv = (v >> (ts.texture_shift - 8)) & 0xff;
+
+        /* Taps: 0 = (u, v), 1 = (u + step, v), 2 = (u, v + step), 3 = (u + step, v + step). */
+        for (i = 0; i < 4; i++)
+        {
+                ts.u = u + ((i & 1) ? step : 0);
+                ts.v = v + ((i & 2) ? step : 0);
+                tex_read(state, &ts, &tap[i]);
+        }
+
+        /* The four weights sum to 65536, hence the final >> 16. */
+        {
+                const int w00 = (256 - fu) * (256 - fv);
+                const int w10 = fu * (256 - fv);
+                const int w01 = (256 - fu) * fv;
+                const int w11 = fu * fv;
+
+                state->dest_rgba.r = (tap[0].r * w00 + tap[1].r * w10 + tap[2].r * w01 + tap[3].r * w11) >> 16;
+                state->dest_rgba.g = (tap[0].g * w00 + tap[1].g * w10 + tap[2].g * w01 + tap[3].g * w11) >> 16;
+                state->dest_rgba.b = (tap[0].b * w00 + tap[1].b * w10 + tap[2].b * w01 + tap[3].b * w11) >> 16;
+                state->dest_rgba.a = (tap[0].a * w00 + tap[1].a * w10 + tap[2].a * w01 + tap[3].a * w11) >> 16;
+        }
+}
+
+
+static void tex_sample_persp_mipmap(s3d_state_t *state)
+{
+        /* Point sample at level max_d minus bits 27..30 of d (clamped at 0); u and v are scaled by
+           recip = 2^46 / state->w (0 when state->w is zero) and shifted right by 12 + max_d. */
+        s3d_texture_state_t ts;
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 12 + state->max_d;
+        const int lod = (state->d < 0) ? 0 : ((state->d >> 27) & 0xf);  /* no reduction for negative d */
+
+        ts.level = state->max_d - lod;
+        if (ts.level < 0)
+                ts.level = 0;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        ts.u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        ts.v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+        tex_read(state, &ts, &state->dest_rgba);
+}
+
+static void tex_sample_persp_mipmap_filter(s3d_state_t *state)
+{
+        /* Bilinear sample at level max_d minus bits 27..30 of d (clamped at 0).  u and v are first
+           scaled by recip = 2^46 / state->w (0 when state->w is zero) and each product is shifted
+           right by 12 + max_d before tbu/tbv is added. */
+        s3d_texture_state_t ts;
+        rgba_t tap[4];
+        /* NOTE(review): the 1ULL dividend makes this an unsigned division - confirm that is
+           the intended treatment of a negative state->w. */
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 12 + state->max_d;
+        const int32_t u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        const int32_t v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+        const int lod = (state->d < 0) ? 0 : ((state->d >> 27) & 0xf);  /* no reduction for negative d */
+        int step;
+        int fu, fv;
+        int i;
+
+        ts.level = state->max_d - lod;
+        if (ts.level < 0)
+                ts.level = 0;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        step = 1 << ts.texture_shift;           /* advances the texel index by one */
+
+        /* Blend fractions: the 8 bits of u and v just below the texel index. */
+        fu = (u >> (ts.texture_shift - 8)) & 0xff;
+        fv = (v >> (ts.texture_shift - 8)) & 0xff;
+
+        /* Taps: 0 = (u, v), 1 = (u + step, v), 2 = (u, v + step), 3 = (u + step, v + step). */
+        for (i = 0; i < 4; i++)
+        {
+                ts.u = u + ((i & 1) ? step : 0);
+                ts.v = v + ((i & 2) ? step : 0);
+                tex_read(state, &ts, &tap[i]);
+        }
+
+        /* The four weights sum to 65536, hence the final >> 16. */
+        {
+                const int w00 = (256 - fu) * (256 - fv);
+                const int w10 = fu * (256 - fv);
+                const int w01 = (256 - fu) * fv;
+                const int w11 = fu * fv;
+
+                state->dest_rgba.r = (tap[0].r * w00 + tap[1].r * w10 + tap[2].r * w01 + tap[3].r * w11) >> 16;
+                state->dest_rgba.g = (tap[0].g * w00 + tap[1].g * w10 + tap[2].g * w01 + tap[3].g * w11) >> 16;
+                state->dest_rgba.b = (tap[0].b * w00 + tap[1].b * w10 + tap[2].b * w01 + tap[3].b * w11) >> 16;
+                state->dest_rgba.a = (tap[0].a * w00 + tap[1].a * w10 + tap[2].a * w01 + tap[3].a * w11) >> 16;
+        }
+}
+
+static void tex_sample_persp_mipmap_375(s3d_state_t *state)
+{
+        /* Point sample at level max_d minus bits 27..30 of d (clamped at 0); u and v are scaled by
+           recip = 2^46 / state->w (0 when state->w is zero) and shifted right by 8 + max_d. */
+        s3d_texture_state_t ts;
+        const int32_t recip = state->w ? (int32_t)((1ULL << 46) / (int64_t)state->w) : 0;
+        const int down = 8 + state->max_d;
+        const int lod = (state->d < 0) ? 0 : ((state->d >> 27) & 0xf);  /* no reduction for negative d */
+
+        ts.level = state->max_d - lod;
+        if (ts.level < 0)
+                ts.level = 0;
+        ts.texture_shift = 27 - ts.level;       /* 18 + (9 - level) */
+        ts.u = (int32_t)(((int64_t)state->u * recip) >> down) + state->tbu;
+        ts.v = (int32_t)(((int64_t)state->v * recip) >> down) + state->tbv;
+        tex_read(state, &ts, &state->dest_rgba);
+}
+
+/* Perspective-divided, mipmapped texture sample with bilinear filtering ("_375" variant).
+ * Fetches a 2x2 group of samples through tex_read and blends them into state->dest_rgba,
+ * weighted by the 8 fraction bits of u and v that sit just below texture_shift. */
+static void tex_sample_persp_mipmap_filter_375(s3d_state_t *state)
+{
+        s3d_texture_state_t ts;
+        rgba_t texel[4];
+        int weight[4];
+        int32_t w, u, v;
+        int step, frac_u, frac_v, i;
+
+        /* (1 << 46) / state->w, left at 0 when state->w is 0 so no division by zero occurs. */
+        w = state->w ? (int32_t)(((1ULL << 27) << 19) / (int64_t)state->w) : 0;
+        u = (int32_t)(((int64_t)state->u * (int64_t)w) >> (8 + state->max_d)) + state->tbu;
+        v = (int32_t)(((int64_t)state->v * (int64_t)w) >> (8 + state->max_d)) + state->tbv;
+
+        /* Level is max_d when d is negative, otherwise max_d minus bits 27..30 of d (floored at 0). */
+        if (state->d < 0)
+                ts.level = state->max_d;
+        else
+                ts.level = state->max_d - ((state->d >> 27) & 0xf);
+        if (ts.level < 0)
+                ts.level = 0;
+        ts.texture_shift = 18 + (9 - ts.level);
+        step = 1 << ts.texture_shift;   /* u/v distance to the neighbouring sample */
+        ts.u = u;
+        ts.v = v;
+        tex_read(state, &ts, &texel[0]);
+        frac_u = (u >> (ts.texture_shift - 8)) & 0xff;
+        frac_v = (v >> (ts.texture_shift - 8)) & 0xff;
+
+        /* Samples 1..3: bit 0 of i offsets u by one step, bit 1 offsets v. */
+        for (i = 1; i < 4; i++)
+        {
+                ts.u = (i & 1) ? u + step : u;
+                ts.v = (i & 2) ? v + step : v;
+                tex_read(state, &ts, &texel[i]);
+        }
+
+        weight[0] = (256 - frac_u) * (256 - frac_v);
+        weight[1] = frac_u * (256 - frac_v);
+        weight[2] = (256 - frac_u) * frac_v;
+        weight[3] = frac_u * frac_v;
+
+        state->dest_rgba.r = (texel[0].r * weight[0] + texel[1].r * weight[1] + texel[2].r * weight[2] + texel[3].r * weight[3]) >> 16;
+        state->dest_rgba.g = (texel[0].g * weight[0] + texel[1].g * weight[1] + texel[2].g * weight[2] + texel[3].g * weight[3]) >> 16;
+        state->dest_rgba.b = (texel[0].b * weight[0] + texel[1].b * weight[1] + texel[2].b * weight[2] + texel[3].b * weight[3]) >> 16;
+        state->dest_rgba.a = (texel[0].a * weight[0] + texel[1].a * weight[1] + texel[2].a * weight[2] + texel[3].a * weight[3]) >> 16;
+}
+
+
+/* Clamp the integer lvalue x to 0..255 in place (x is evaluated more than once). */
+#define CLAMP(x)                                        \
+        {                                               \
+                if ((x) < 0)            x = 0;          \
+                else if ((x) > 0xff)    x = 0xff;       \
+        }
+
+/* Clamp four integer lvalues to 0..255 in place (each argument is evaluated more than once).
+ * The braces keep all four tests together when the macro is the body of an unbraced if or loop. */
+#define CLAMP_RGBA(r, g, b, a)                                  \
+        {                                                       \
+                if ((r) & ~0xff) r = ((r) < 0) ? 0 : 0xff;      \
+                if ((g) & ~0xff) g = ((g) < 0) ? 0 : 0xff;      \
+                if ((b) & ~0xff) b = ((b) < 0) ? 0 : 0xff;      \
+                if ((a) & ~0xff) a = ((a) < 0) ? 0 : 0xff;      \
+        }
+        
+/* Clamp three integer lvalues to 0..255 in place (arguments are evaluated more than once). */
+#define CLAMP_RGB(r, g, b)              \
+        {                               \
+                if ((r) > 0xff)         \
+                        r = 0xff;       \
+                else if ((r) < 0)       \
+                        r = 0;          \
+                if ((g) > 0xff)         \
+                        g = 0xff;       \
+                else if ((g) < 0)       \
+                        g = 0;          \
+                if ((b) > 0xff)         \
+                        b = 0xff;       \
+                else if ((b) < 0)       \
+                        b = 0;          \
+        }
+
+static void dest_pixel_gouraud_shaded_triangle(s3d_state_t *state)
+{
+        /* Untextured pixel: every channel of dest_rgba is the matching colour interpolator
+         * shifted right by 7 bits, then clamped to 0..255. */
+        state->dest_rgba.r = state->r >> 7;
+        state->dest_rgba.g = state->g >> 7;
+        state->dest_rgba.b = state->b >> 7;
+        state->dest_rgba.a = state->a >> 7;
+
+        CLAMP(state->dest_rgba.r);
+        CLAMP(state->dest_rgba.g);
+        CLAMP(state->dest_rgba.b);
+        CLAMP(state->dest_rgba.a);
+}
+
+static void dest_pixel_unlit_texture_triangle(s3d_state_t *state)
+{
+        /* Texture sample only; alpha is replaced by state->a >> 7 when CMD_SET_ABC_SRC is set. */
+        tex_sample(state);
+        if ((state->cmd_set & CMD_SET_ABC_SRC) != 0)
+                state->dest_rgba.a = state->a >> 7;
+}
+
+static void dest_pixel_lit_texture_decal(s3d_state_t *state)
+{
+        /* Decal: the sampled texel is used as-is; only alpha may come from state->a (CMD_SET_ABC_SRC). */
+        tex_sample(state);
+        if ((state->cmd_set & CMD_SET_ABC_SRC) != 0)
+                state->dest_rgba.a = state->a >> 7;
+}
+
+static void dest_pixel_lit_texture_reflection(s3d_state_t *state)
+{
+        /* Sampled texel plus the colour interpolators (each shifted right by 7). */
+        tex_sample(state);
+        state->dest_rgba.r = state->dest_rgba.r + (state->r >> 7);
+        state->dest_rgba.g = state->dest_rgba.g + (state->g >> 7);
+        state->dest_rgba.b = state->dest_rgba.b + (state->b >> 7);
+        if ((state->cmd_set & CMD_SET_ABC_SRC) != 0)
+                state->dest_rgba.a = state->dest_rgba.a + (state->a >> 7);
+        /* The sums can leave 0..255, so bring all four channels back into range. */
+        CLAMP_RGBA(state->dest_rgba.r, state->dest_rgba.g, state->dest_rgba.b, state->dest_rgba.a);
+}
+
+static void dest_pixel_lit_texture_modulate(s3d_state_t *state)
+{
+        /* Sampled texel scaled by the clamped colour interpolators (x * c >> 8 per channel). */
+        int lit_r = state->r >> 7, lit_g = state->g >> 7;
+        int lit_b = state->b >> 7, lit_a = state->a >> 7;
+
+        tex_sample(state);
+        CLAMP_RGBA(lit_r, lit_g, lit_b, lit_a);
+
+        state->dest_rgba.r = (state->dest_rgba.r * lit_r) >> 8;
+        state->dest_rgba.g = (state->dest_rgba.g * lit_g) >> 8;
+        state->dest_rgba.b = (state->dest_rgba.b * lit_b) >> 8;
+        if ((state->cmd_set & CMD_SET_ABC_SRC) != 0)
+                state->dest_rgba.a = lit_a;     /* alpha is replaced, not scaled */
+}
+
+/* Rasterise up to yc scanlines of the triangle described by s3d_tri, starting at state->y and
+ * moving towards lower y. dx1 and dx2 are added to state->x1 / state->x2 after every line, and
+ * the per-line base_* interpolants in *state are advanced the same way, so a following call
+ * continues where this one stopped. When CMD_SET_HC is set, lines and spans are clipped to
+ * clip_t/clip_b and clip_l/clip_r. Only bpp == 1 (16 bits per pixel) is actually drawn. */
+void tri(s3d_state_t *state, int yc, int32_t dx1, int32_t dx2)
+{
+        int x_dir = s3d_tri.tlr ? 1 : -1;                       /* +1 walks spans towards higher x, -1 towards lower x */
+        int use_z = !(s3d_tri.cmd_set & CMD_SET_ZB_MODE);       /* Z test only when the ZB_MODE bits are all clear */
+        int y_count = yc;
+        int bpp = (s3d_tri.cmd_set >> 2) & 7;                   /* bytes per pixel minus one */
+        uint32_t dest_offset, z_offset;
+
+        if (s3d_tri.cmd_set & CMD_SET_HC)
+        {
+                if (state->y < s3d_tri.clip_t)
+                        return;
+                if (state->y > s3d_tri.clip_b)
+                {
+                        int diff_y = state->y - s3d_tri.clip_b;
+
+                        if (diff_y > y_count)
+                                diff_y = y_count;
+
+                        state->base_u += (s3d_tri.TdUdY * diff_y);
+                        state->base_v += (s3d_tri.TdVdY * diff_y);
+                        state->base_z += (s3d_tri.TdZdY * diff_y);
+                        state->base_r += (s3d_tri.TdRdY * diff_y);
+                        state->base_g += (s3d_tri.TdGdY * diff_y);
+                        state->base_b += (s3d_tri.TdBdY * diff_y);
+                        state->base_a += (s3d_tri.TdAdY * diff_y);
+                        state->base_d += (s3d_tri.TdDdY * diff_y);
+                        state->base_w += (s3d_tri.TdWdY * diff_y);
+                        state->x1 += (dx1 * diff_y);
+                        state->x2 += (dx2 * diff_y);
+                        state->y -= diff_y;
+                        /* dest_offset and z_offset are computed from the updated state->y just below;
+                         * they are not initialised yet at this point, so they must not be touched here. */
+                        y_count -= diff_y;
+                }
+                if ((state->y - y_count) < s3d_tri.clip_t)
+                        y_count = state->y - s3d_tri.clip_t;
+        }
+
+        dest_offset = s3d_tri.dest_base + (state->y * s3d_tri.dest_str);
+        z_offset = s3d_tri.z_base + (state->y * s3d_tri.z_str);
+
+        for (; y_count > 0; y_count--)
+        {
+                int x  = (state->x1 + ((1 << 20) - 1)) >> 20;
+                int xe = (state->x2 + ((1 << 20) - 1)) >> 20;
+                uint32_t z = (state->base_z > 0) ? (state->base_z << 1) : 0;
+                if (x_dir < 0)
+                {
+                        x--;
+                        xe--;
+                }
+
+                if (x != xe && ((x_dir > 0 && x < xe) || (x_dir < 0 && x > xe)))
+                {
+                        uint32_t dest_addr, z_addr;
+                        int dx = (x_dir > 0) ? ((31 - ((state->x1-1) >> 15)) & 0x1f) : (((state->x1-1) >> 15) & 0x1f);
+                        int x_offset = x_dir * (bpp + 1);
+                        int xz_offset = x_dir << 1;
+                        if (x_dir > 0)
+                                dx += 1;
+                        state->r = state->base_r + ((s3d_tri.TdRdX * dx) >> 5);
+                        state->g = state->base_g + ((s3d_tri.TdGdX * dx) >> 5);
+                        state->b = state->base_b + ((s3d_tri.TdBdX * dx) >> 5);
+                        state->a = state->base_a + ((s3d_tri.TdAdX * dx) >> 5);
+                        state->u = state->base_u + ((s3d_tri.TdUdX * dx) >> 5);
+                        state->v = state->base_v + ((s3d_tri.TdVdX * dx) >> 5);
+                        state->w = state->base_w + ((s3d_tri.TdWdX * dx) >> 5);
+                        state->d = state->base_d + ((s3d_tri.TdDdX * dx) >> 5);
+                        z += ((s3d_tri.TdZdX * dx) >> 5);
+
+//                        LOG_MSG("Draw Y=%i X=%i to XE=%i  %i   %08x %08x %08x %08x  %08x %08x %08x %08x  %i %08x\n", state->y, x, xe, dx, state->x1, state->x2, dx1, s3d.TdWdX, state->u, state->v, s3d.TdUdX, s3d.TdUdY, dx, (s3d.TdUdX * dx) >> 4);
+
+                        if (s3d_tri.cmd_set & CMD_SET_HC)
+                        {
+                                if (x_dir > 0)
+                                {
+                                        if (x > s3d_tri.clip_r)
+                                                goto tri_skip_line;
+                                        if (xe < s3d_tri.clip_l)
+                                                goto tri_skip_line;
+                                        if (xe > s3d_tri.clip_r)
+                                                xe = s3d_tri.clip_r;
+                                        if (x < s3d_tri.clip_l)
+                                        {
+                                                int diff_x = s3d_tri.clip_l - x;
+
+                                                z += (s3d_tri.TdZdX * diff_x);
+                                                state->u += (s3d_tri.TdUdX * diff_x);
+                                                state->v += (s3d_tri.TdVdX * diff_x);
+                                                state->r += (s3d_tri.TdRdX * diff_x);
+                                                state->g += (s3d_tri.TdGdX * diff_x);
+                                                state->b += (s3d_tri.TdBdX * diff_x);
+                                                state->a += (s3d_tri.TdAdX * diff_x);
+                                                state->d += (s3d_tri.TdDdX * diff_x);
+                                                state->w += (s3d_tri.TdWdX * diff_x);
+
+                                                x = s3d_tri.clip_l;
+                                        }
+                                }
+                                else
+                                {
+                                        if (x < s3d_tri.clip_l)
+                                                goto tri_skip_line;
+                                        if (xe > s3d_tri.clip_r)
+                                                goto tri_skip_line;
+                                        if (xe < s3d_tri.clip_l)
+                                                xe = s3d_tri.clip_l;
+                                        if (x > s3d_tri.clip_r)
+                                        {
+                                                int diff_x = x - s3d_tri.clip_r;
+
+                                                z += (s3d_tri.TdZdX * diff_x);
+                                                state->u += (s3d_tri.TdUdX * diff_x);
+                                                state->v += (s3d_tri.TdVdX * diff_x);
+                                                state->r += (s3d_tri.TdRdX * diff_x);
+                                                state->g += (s3d_tri.TdGdX * diff_x);
+                                                state->b += (s3d_tri.TdBdX * diff_x);
+                                                state->a += (s3d_tri.TdAdX * diff_x);
+                                                state->d += (s3d_tri.TdDdX * diff_x);
+                                                state->w += (s3d_tri.TdWdX * diff_x);
+
+                                                x = s3d_tri.clip_r;
+                                        }
+                                }
+                        }
+
+                        dest_addr = dest_offset + (x * (bpp + 1));
+                        z_addr = z_offset + (x << 1);
+                        x &= 0xfff; xe &= 0xfff;        /* the loop wraps x to 12 bits; wrap both ends so x can always reach xe */
+                        for (; x != xe; x = (x + x_dir) & 0xfff)
+                        {
+                                int update = 1;
+                                uint16_t src_z;
+                                _x = x; _y = state->y;
+
+                                if (use_z)
+                                {
+                                        src_z = Z_READ(z_addr);
+                                        Z_CLIP(src_z, z >> 16);
+                                }
+
+                                if (update)
+                                {
+                                        uint32_t dest_col;
+
+                                        dest_pixel(state);
+
+                                        if (s3d_tri.cmd_set & CMD_SET_ABC_ENABLE)
+                                        {
+                                                uint32_t src_col;
+                                                int src_r = 0, src_g = 0, src_b = 0;    /* defined even when bpp is not handled below */
+
+                                                switch (bpp)
+                                                {
+                                                        case 1: /*16 bpp*/
+                                                        src_col = *(uint16_t *)&vga.mem.linear[dest_addr];      /* whole 16-bit pixel */
+                                                        RGB15_TO_24(src_col, src_r, src_g, src_b);
+                                                        break;
+                                                }
+
+                                                state->dest_rgba.r = ((state->dest_rgba.r * state->dest_rgba.a) + (src_r * (255 - state->dest_rgba.a))) / 255;
+                                                state->dest_rgba.g = ((state->dest_rgba.g * state->dest_rgba.a) + (src_g * (255 - state->dest_rgba.a))) / 255;
+                                                state->dest_rgba.b = ((state->dest_rgba.b * state->dest_rgba.a) + (src_b * (255 - state->dest_rgba.a))) / 255;
+                                        }
+
+                                        switch (bpp)
+                                        {
+                                                case 1: /*16 bpp*/
+                                                RGB15(state->dest_rgba.r, state->dest_rgba.g, state->dest_rgba.b, dest_col);
+                                                /* NOTE(review): assumes vga.mem.linear is a byte array (as the uint16_t cast in s3_virge_triangle implies), so store all 16 bits */
+                                                *(uint16_t *)&vga.mem.linear[dest_addr] = dest_col;
+                                                break;
+                                        }
+
+                                        if (use_z && (s3d_tri.cmd_set & CMD_SET_ZUP))
+                                                Z_WRITE(z_addr, src_z);
+                                }
+
+                                z += s3d_tri.TdZdX;
+                                state->u += s3d_tri.TdUdX;
+                                state->v += s3d_tri.TdVdX;
+                                state->r += s3d_tri.TdRdX;
+                                state->g += s3d_tri.TdGdX;
+                                state->b += s3d_tri.TdBdX;
+                                state->a += s3d_tri.TdAdX;
+                                state->d += s3d_tri.TdDdX;
+                                state->w += s3d_tri.TdWdX;
+                                dest_addr += x_offset;
+                                z_addr += xz_offset;
+                                pixel_count++;
+                        }
+                }
+tri_skip_line:
+                state->x1 += dx1;
+                state->x2 += dx2;
+                state->base_u += s3d_tri.TdUdY;
+                state->base_v += s3d_tri.TdVdY;
+                state->base_z += s3d_tri.TdZdY;
+                state->base_r += s3d_tri.TdRdY;
+                state->base_g += s3d_tri.TdGdY;
+                state->base_b += s3d_tri.TdBdY;
+                state->base_a += s3d_tri.TdAdY;
+                state->base_d += s3d_tri.TdDdY;
+                state->base_w += s3d_tri.TdWdY;
+                state->y--;
+                dest_offset -= s3d_tri.dest_str;
+                z_offset -= s3d_tri.z_str;
+        }
+}
+
+static int tex_size[8] =        /* indexed by (cmd_set >> 5) & 7; s3_virge_triangle halves the entry when sizing a mip level */
+{
+        8,      /* 0 */
+        4,      /* 1 */
+        4,      /* 2 */
+        2,      /* 3 */
+        2,      /* 4 */
+        2,      /* 5 */
+        2,      /* 6 */
+        2       /* 7 */
+};
+
+/* Draw the triangle described by s3d_tri: copy its start values into a local state, choose the
+ * per-pixel, sampler and texel-read routines from cmd_set, then rasterise both edge sections with tri(). */
+void s3_virge_triangle()
+{
+        s3d_state_t state;
+        uint32_t tex_base;
+        int c;
+        state.tbu = s3d_tri.tbu << 11;
+        state.tbv = s3d_tri.tbv << 11;
+
+        state.max_d = (s3d_tri.cmd_set >> 8) & 15;
+
+        state.tex_bdr_clr = s3d_tri.tex_bdr_clr;
+
+        state.cmd_set = s3d_tri.cmd_set;
+
+        state.base_u = s3d_tri.tus;
+        state.base_v = s3d_tri.tvs;
+        state.base_z = s3d_tri.tzs;
+        state.base_r = (int32_t)s3d_tri.trs;
+        state.base_g = (int32_t)s3d_tri.tgs;
+        state.base_b = (int32_t)s3d_tri.tbs;
+        state.base_a = (int32_t)s3d_tri.tas;
+        state.base_d = s3d_tri.tds;
+        state.base_w = s3d_tri.tws;
+
+        tex_base = s3d_tri.tex_base;
+        for (c = 9; c >= 0; c--)
+        {
+                /* texture[c] points at mip level c inside vga.mem.linear. Levels above max_d all share
+                 * the starting address; from max_d downwards each level follows the previous one. */
+                state.texture[c] = (uint16_t *)&vga.mem.linear[tex_base];
+                if (c <= state.max_d)
+                        tex_base += ((1 << (c*2)) * tex_size[(s3d_tri.cmd_set >> 5) & 7]) / 2;
+        }
+
+        switch ((s3d_tri.cmd_set >> 27) & 0xf)
+        {
+                case 0:
+                dest_pixel = dest_pixel_gouraud_shaded_triangle;
+//              LOG_MSG("dest_pixel_gouraud_shaded_triangle\n");
+                break;
+                case 1:
+                case 5:
+                switch ((s3d_tri.cmd_set >> 15) & 0x3)
+                {
+                        case 0:
+                        dest_pixel = dest_pixel_lit_texture_reflection;
+//                        LOG_MSG("dest_pixel_lit_texture_reflection\n");
+                        break;
+                        case 1:
+                        dest_pixel = dest_pixel_lit_texture_modulate;
+//                        LOG_MSG("dest_pixel_lit_texture_modulate\n");
+                        break;
+                        case 2:
+                        dest_pixel = dest_pixel_lit_texture_decal;
+//                        LOG_MSG("dest_pixel_lit_texture_decal\n");
+                        break;
+                        default:
+                        LOG_MSG("bad lit texture mode %x\n", (s3d_tri.cmd_set >> 15) & 0x3);    /* report the field that was rejected */
+                        return;
+                }
+                break;
+                case 2:
+                case 6:
+                dest_pixel = dest_pixel_unlit_texture_triangle;
+//                LOG_MSG("dest_pixel_unlit_texture_triangle\n");
+                break;
+                default:
+                LOG_MSG("bad triangle type %x\n", (s3d_tri.cmd_set >> 27) & 0xf);
+                return;
+        }        
+        /* Sampler: bits 12..14 of cmd_set, with bit 29 (mapped to 8) selecting the persp variants. */
+        switch (((s3d_tri.cmd_set >> 12) & 7) | ((s3d_tri.cmd_set & (1 << 29)) ? 8 : 0))
+        {
+                case 0: case 1:
+                tex_sample = tex_sample_mipmap;
+//                LOG_MSG("use tex_sample_mipmap\n");
+                break;
+                case 2: case 3:
+                tex_sample = bilinear_enabled ? tex_sample_mipmap_filter : tex_sample_mipmap;
+//                LOG_MSG("use tex_sample_mipmap_filter\n");
+                break;
+                case 4: case 5:
+                tex_sample = tex_sample_normal;
+//                LOG_MSG("use tex_sample_normal\n");
+                break;
+                case 6: case 7:
+                tex_sample = bilinear_enabled ? tex_sample_normal_filter : tex_sample_normal;
+//                LOG_MSG("use tex_sample_normal_filter\n");
+                break;
+                case (0 | 8): case (1 | 8):
+                if (is_375)
+                        tex_sample = tex_sample_persp_mipmap_375;
+                else
+                        tex_sample = tex_sample_persp_mipmap;
+//                LOG_MSG("use tex_sample_persp_mipmap\n");
+                break;
+                case (2 | 8): case (3 | 8):
+                if (is_375)
+                        tex_sample = bilinear_enabled ? tex_sample_persp_mipmap_filter_375 : tex_sample_persp_mipmap_375;
+                else
+                        tex_sample = bilinear_enabled ? tex_sample_persp_mipmap_filter : tex_sample_persp_mipmap;
+//                LOG_MSG("use tex_sample_persp_mipmap_filter\n");
+                break;
+                case (4 | 8): case (5 | 8):
+                if (is_375)
+                        tex_sample = tex_sample_persp_normal_375;
+                else
+                        tex_sample = tex_sample_persp_normal;
+//                LOG_MSG("use tex_sample_persp_normal\n");
+                break;
+                case (6 | 8): case (7 | 8):
+                if (is_375)
+                        tex_sample = bilinear_enabled ? tex_sample_persp_normal_filter_375 : tex_sample_persp_normal_375;
+                else
+                        tex_sample = bilinear_enabled ? tex_sample_persp_normal_filter : tex_sample_persp_normal;
+//                LOG_MSG("use tex_sample_persp_normal_filter\n");
+                break;
+        }
+        /* Texel reader by format (bits 5..7); CMD_SET_TWE picks the plain routine over its _nowrap twin. */
+        switch ((s3d_tri.cmd_set >> 5) & 7)
+        {
+                case 0:
+                tex_read = (s3d_tri.cmd_set & CMD_SET_TWE) ? tex_ARGB8888 : tex_ARGB8888_nowrap;
+                break;
+                case 1:
+                tex_read = (s3d_tri.cmd_set & CMD_SET_TWE) ? tex_ARGB4444 : tex_ARGB4444_nowrap;
+//                LOG_MSG("tex_ARGB4444\n");
+                break;
+                case 2:
+                tex_read = (s3d_tri.cmd_set & CMD_SET_TWE) ? tex_ARGB1555 : tex_ARGB1555_nowrap;
+//                LOG_MSG("tex_ARGB1555 %i\n", (s3d_tri.cmd_set >> 5) & 7);
+                break;
+                default:
+                LOG_MSG("bad texture type %i\n", (s3d_tri.cmd_set >> 5) & 7);
+                tex_read = (s3d_tri.cmd_set & CMD_SET_TWE) ? tex_ARGB1555 : tex_ARGB1555_nowrap;
+                break;
+        }
+
+//        LOG_MSG("Triangle %i %i,%i to %i,%i  %08x\n", y, x1 >> 20, y, s3d_tri.txend01 >> 20, y - (s3d_tri.ty01 + s3d_tri.ty12), state.cmd_set);
+        /* Both passes step x1 by TdXdY02; x2 follows edge 01 for ty01 lines, then edge 12 for ty12 lines. */
+        state.y  = s3d_tri.tys;
+        state.x1 = s3d_tri.txs;
+        state.x2 = s3d_tri.txend01;
+        tri(&state, s3d_tri.ty01, s3d_tri.TdXdY02, s3d_tri.TdXdY01);
+        state.x2 = s3d_tri.txend12;
+        tri(&state, s3d_tri.ty12, s3d_tri.TdXdY02, s3d_tri.TdXdY12);
+}
+
+/* Unpack two pairs of pixels (8 source bytes) into r/g/b[x_write .. x_write+3]. Each pair is read
+ * as luma, Cr, luma, Cb (byte order as this macro names them) and shares one chroma pair.
+ * Uses src, r, g, b and x_write from the enclosing scope; src advances by 8, x_write by 4 (mod 8). */
+#define DECODE_YCbCr()                                                  \
+        {                                                               \
+                int pair;                                               \
+                                                                        \
+                for (pair = 0; pair < 2; pair++)                        \
+                {                                                       \
+                        uint8_t luma0 = src[0];                         \
+                        uint8_t luma1 = src[2];                         \
+                        int8_t Cr = src[1] - 0x80;                      \
+                        int8_t Cb = src[3] - 0x80;                      \
+                                                                        \
+                        int dR = (359*Cr) >> 8;                         \
+                        int dG = (88*Cb + 183*Cr) >> 8;                 \
+                        int dB = (453*Cb) >> 8;                         \
+                                                                        \
+                        src += 4;                                       \
+                                                                        \
+                        r[x_write] = luma0 + dR;                        \
+                        g[x_write] = luma0 - dG;                        \
+                        b[x_write] = luma0 + dB;                        \
+                        CLAMP(r[x_write]);                              \
+                        CLAMP(g[x_write]);                              \
+                        CLAMP(b[x_write]);                              \
+                                                                        \
+                        r[x_write+1] = luma1 + dR;                      \
+                        g[x_write+1] = luma1 - dG;                      \
+                        b[x_write+1] = luma1 + dB;                      \
+                        CLAMP(r[x_write+1]);                            \
+                        CLAMP(g[x_write+1]);                            \
+                        CLAMP(b[x_write+1]);                            \
+                                                                        \
+                        x_write = (x_write + 2) & 7;                    \
+                }                                                       \
+        }
+
+/* Both YUV formats are untested. */
+#define DECODE_YUV211()                                         \
+        {                                                       \
+                int y1, y2, y3, y4;                             \
+                int8_t U, V;                                    \
+                int dR, dG, dB;                                 \
+                /* Luma stays int so values outside 0..255 reach CLAMP instead of wrapping in a byte. */ \
+                U = src[0] - 0x80;                              \
+                y1 = (298 * (src[1] - 16)) >> 8;                \
+                y2 = (298 * (src[2] - 16)) >> 8;                \
+                V = src[3] - 0x80;                              \
+                y3 = (298 * (src[4] - 16)) >> 8;                \
+                y4 = (298 * (src[5] - 16)) >> 8;                \
+                src += 6;                                       \
+                                                                \
+                dR = (309*V) >> 8;                              \
+                dG = (100*U + 208*V) >> 8;                      \
+                dB = (516*U) >> 8;                              \
+                                                                \
+                r[x_write] = y1 + dR;                           \
+                CLAMP(r[x_write]);                              \
+                g[x_write] = y1 - dG;                           \
+                CLAMP(g[x_write]);                              \
+                b[x_write] = y1 + dB;                           \
+                CLAMP(b[x_write]);                              \
+                                                                \
+                r[x_write+1] = y2 + dR;                         \
+                CLAMP(r[x_write+1]);                            \
+                g[x_write+1] = y2 - dG;                         \
+                CLAMP(g[x_write+1]);                            \
+                b[x_write+1] = y2 + dB;                         \
+                CLAMP(b[x_write+1]);                            \
+                /* Pixels 3 and 4 use their own luma samples (y3, y4); all four share U and V. */ \
+                r[x_write+2] = y3 + dR;                         \
+                CLAMP(r[x_write+2]);                            \
+                g[x_write+2] = y3 - dG;                         \
+                CLAMP(g[x_write+2]);                            \
+                b[x_write+2] = y3 + dB;                         \
+                CLAMP(b[x_write+2]);                            \
+                                                                \
+                r[x_write+3] = y4 + dR;                         \
+                CLAMP(r[x_write+3]);                            \
+                g[x_write+3] = y4 - dG;                         \
+                CLAMP(g[x_write+3]);                            \
+                b[x_write+3] = y4 + dB;                         \
+                CLAMP(b[x_write+3]);                            \
+                                                                \
+                x_write = (x_write + 4) & 7;                    \
+        }
+
+/* Unpack two U, Y, V, Y groups (8 source bytes) into r/g/b[x_write .. x_write+3]; each group
+ * yields two pixels that share one U/V pair. This path is marked untested in this file. */
+#define DECODE_YUV422()                                                 \
+        {                                                               \
+                for (int c = 0; c < 2; c++)                             \
+                {                                                       \
+                        int y1, y2;     /* int, so out-of-range luma reaches CLAMP unwrapped */ \
+                        int8_t U, V;                                    \
+                        int dR, dG, dB;                                 \
+                                                                        \
+                        U = src[0] - 0x80;                              \
+                        y1 = (298 * (src[1] - 16)) >> 8;                \
+                        V = src[2] - 0x80;                              \
+                        y2 = (298 * (src[3] - 16)) >> 8;                \
+                        src += 4;                                       \
+                                                                        \
+                        dR = (309*V) >> 8;                              \
+                        dG = (100*U + 208*V) >> 8;                      \
+                        dB = (516*U) >> 8;                              \
+                                                                        \
+                        r[x_write] = y1 + dR;                           \
+                        CLAMP(r[x_write]);                              \
+                        g[x_write] = y1 - dG;                           \
+                        CLAMP(g[x_write]);                              \
+                        b[x_write] = y1 + dB;                           \
+                        CLAMP(b[x_write]);                              \
+                                                                        \
+                        r[x_write+1] = y2 + dR;                         \
+                        CLAMP(r[x_write+1]);                            \
+                        g[x_write+1] = y2 - dG;                         \
+                        CLAMP(g[x_write+1]);                            \
+                        b[x_write+1] = y2 + dB;                         \
+                        CLAMP(b[x_write+1]);                            \
+                                                                        \
+                        x_write = (x_write + 2) & 7;                    \
+                }                                                       \
+        }
+
+#define DECODE_RGB555()                                                 \
+        { /* Unpack four 16-bit words from src into r[], g[], b[]. */   \
+                int c;                                                  \
+                /* NOTE(review): bits 0-4 feed r[], bits 10-14 feed b[]; confirm. */ \
+                for (c = 0; c < 4; c++)                                 \
+                {                                                       \
+                        uint16_t dat;                                   \
+                        /* Host-order 16-bit read; src assumed byte pointer - TODO confirm. */ \
+                        dat = *(uint16_t *)src;                         \
+                        src += 2;                                       \
+                        /* Widen each 5-bit field to 8 bits; its top 3 bits fill bits 0-2. */ \
+                        r[x_write + c] = ((dat & 0x001f) << 3) | ((dat & 0x001f) >> 2); \
+                        g[x_write + c] = ((dat & 0x03e0) >> 2) | ((dat & 0x03e0) >> 7); \
+                        b[x_write + c] = ((dat & 0x7c00) >> 7) | ((dat & 0x7c00) >> 12); \
+                }                                                       \
+                x_write = (x_write + 4) & 7; /* advance by 4, wrap at 8 */ \
+        } 
+
+#define DECODE_RGB565()                                                 \
+        { /* Unpack four 16-bit words from src into r[], g[], b[]. */   \
+                int c;                                                  \
+                /* NOTE(review): bits 0-4 feed r[], bits 11-15 feed b[]; confirm. */ \
+                for (c = 0; c < 4; c++)                                 \
+                {                                                       \
+                        uint16_t dat;                                   \
+                        /* Host-order 16-bit read; src assumed byte pointer - TODO confirm. */ \
+                        dat = *(uint16_t *)src;                         \
+                        src += 2;                                       \
+                        /* Widen to 8 bits: 5-bit fields reuse their top 3 bits, the 6-bit field its top 2. */ \
+                        r[x_write + c] = ((dat & 0x001f) << 3) | ((dat & 0x001f) >> 2); \
+                        g[x_write + c] = ((dat & 0x07e0) >> 3) | ((dat & 0x07e0) >> 9); \
+                        b[x_write + c] = ((dat & 0xf800) >> 8) | ((dat & 0xf800) >> 13); \
+                }                                                       \
+                x_write = (x_write + 4) & 7; /* advance by 4, wrap at 8 */ \
+        } 
+
+#define DECODE_RGB888()                                                 \
+        { /* Copy four 3-element pixels from src into r[], g[], b[]. */ \
+                int c;                                                  \
+                /* NOTE(review): src[0] lands in r[], src[2] in b[]; confirm order. */ \
+                for (c = 0; c < 4; c++)                                 \
+                {                                                       \
+                        r[x_write + c] = src[0];                        \
+                        g[x_write + c] = src[1];                        \
+                        b[x_write + c] = src[2];                        \
+                        src += 3; /* next packed pixel */               \
+                }                                                       \
+                x_write = (x_write + 4) & 7; /* advance by 4, wrap at 8 */ \
+        } 
+
+#define DECODE_XRGB8888()                                               \
+        { /* Copy four 4-element pixels from src into r[], g[], b[]. */ \
+                int c;                                                  \
+                /* NOTE(review): src[0] lands in r[], src[2] in b[]; confirm order. */ \
+                for (c = 0; c < 4; c++)                                 \
+                {                                                       \
+                        r[x_write + c] = src[0];                        \
+                        g[x_write + c] = src[1];                        \
+                        b[x_write + c] = src[2];                        \
+                        src += 4; /* src[3] is never read */            \
+                }                                                       \
+                x_write = (x_write + 4) & 7; /* advance by 4, wrap at 8 */ \
+        }